Initial commit to resolve merge conflicts
deven-amd committed Dec 6, 2021
1 parent 9efea3e commit b3d2b2f
Showing 5 changed files with 57 additions and 182 deletions.
@@ -262,17 +262,11 @@ extern "C" void* _mlir_ciface_tf_jit_compile(
ctx->op_device_context()->stream()->GetCudaComputeCapability();
architectures.push_back(absl::StrCat("sm_", cc.major, cc.minor));
#elif defined(TENSORFLOW_USE_ROCM)
<<<<<<< HEAD
architectures.push_back(
ctx->op_device_context()->stream()->parent()
->GetDeviceDescription().rocm_amdgpu_gcn_arch_name());
=======
architectures.push_back(ctx->op_device_context()
->stream()
->parent()
->GetDeviceDescription()
.rocm_amdgpu_gcn_arch_name());
>>>>>>> google_upstream/master
#endif

// Construct `SmallVector`s from arguments.
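Both sides of this conflict are functionally identical: each pushes the device's ROCm GCN architecture string, and they differ only in clang-format line breaking. A minimal sketch of the resolved `#elif` branch, assuming the upstream formatting is the side that survives the merge:

#elif defined(TENSORFLOW_USE_ROCM)
  // Record the GCN architecture name (for example "gfx906") reported by
  // the device description as the JIT compilation target.
  architectures.push_back(ctx->op_device_context()
                              ->stream()
                              ->parent()
                              ->GetDeviceDescription()
                              .rocm_amdgpu_gcn_arch_name());
#endif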
7 changes: 0 additions & 7 deletions tensorflow/core/kernels/linalg/qr_op_impl.h
@@ -237,16 +237,9 @@ class QrOpGpu : public AsyncOpKernel {
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(
context,
<<<<<<< HEAD
solver->Geam(transa, transb, n,
full_matrices_ ? m : min_size, &alpha,
&input_transposed_reshaped(batch, 0, 0), m, &beta,
dummy, n, &r_reshaped(batch, 0, 0), n),
=======
solver->Geam(transa, transb, n, full_matrices_ ? m : min_size,
&alpha, &input_transposed_reshaped(batch, 0, 0), m,
&beta, dummy, n, &r_reshaped(batch, 0, 0), n),
>>>>>>> google_upstream/master
done);
}
}
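The conflict in this hunk is whitespace-only as well; both sides issue the same solver->Geam call. With alpha = 1 and beta = 0 (both are set outside the lines shown), Geam reduces to an out-of-place transpose, which is how the loop moves each factored matrix into r_reshaped. A tiny self-contained illustration of that special case (plain C++, illustrative only, not the solver API):

#include <array>
#include <cstdio>

int main() {
  // 2x3 column-major A; with alpha = 1 and beta = 0, C = A^T (3x2).
  const int m = 2, n = 3;
  std::array<double, 6> a{1, 4, 2, 5, 3, 6};  // A = [[1,2,3],[4,5,6]]
  std::array<double, 6> c{};
  for (int j = 0; j < m; ++j)
    for (int i = 0; i < n; ++i) c[i + j * n] = a[j + i * m];
  std::printf("%g %g\n", c[0], c[3]);  // prints "1 4": first row of A^T
}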
57 changes: 9 additions & 48 deletions tensorflow/core/util/gpu_solvers.h
@@ -102,12 +102,8 @@ inline typename ROCmComplexT<T>::type* ROCmComplex(T* p) {
// Template to give the Rocblas adjoint operation for real and complex types.
template <typename T>
rocblas_operation RocblasAdjointOp() {
<<<<<<< HEAD
return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose : rocblas_operation_transpose;
=======
return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose
: rocblas_operation_transpose;
>>>>>>> google_upstream/master
}
#endif

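For reference, RocblasAdjointOp lets type-generic solver code ask for "the adjoint" without branching on the scalar type. A short sketch of what it resolves to, per the definition above:

// Illustrative only: resolution happens at compile time via
// Eigen::NumTraits<T>::IsComplex.
rocblas_operation real_op = RocblasAdjointOp<float>();
// real_op == rocblas_operation_transpose
rocblas_operation cplx_op = RocblasAdjointOp<std::complex<double>>();
// cplx_op == rocblas_operation_conjugate_transpose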
@@ -272,6 +268,14 @@ class GpuSolver {
const Scalar* const host_a_inverse_dev_ptrs[], int ldainv,
DeviceLapackInfo* dev_lapack_info, int batch_size);

// Computes matrix inverses for a batch of small matrices with size n < 32.
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
const Scalar* const host_a_inverse_dev_ptrs[],
int ldainv, DeviceLapackInfo* dev_lapack_info,
int batch_size);

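The hoisted declaration above takes host-side arrays of device pointers, one per matrix in the batch. A hedged usage sketch (the helper and buffer names are hypothetical; an initialized GpuSolver and populated device buffers are assumed):

// Invert a batch of small n x n matrices (n < 32 per the comment above).
// host_a_ptrs / host_ainv_ptrs are HOST arrays holding DEVICE pointers.
template <typename Scalar>
Status SmallBatchedInverse(GpuSolver* solver, int n,
                           const Scalar* const host_a_ptrs[],
                           const Scalar* const host_ainv_ptrs[],
                           DeviceLapackInfo* dev_info, int batch_size) {
  return solver->MatInvBatched(n, host_a_ptrs, /*lda=*/n, host_ainv_ptrs,
                               /*ldainv=*/n, dev_info, batch_size);
}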
// Cholesky factorization
// Computes the Cholesky factorization A = L * L^H for a batch of small
// matrices.
@@ -287,80 +291,36 @@

// QR factorization.
// Computes QR factorization A = Q * R.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-geqrf
=======
>>>>>>> google_upstream/master
template <typename Scalar>
Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau,
int* dev_lapack_info);

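Geqrf follows the LAPACK geqrf convention: on return dev_A holds R in and above the diagonal, the Householder vectors below it, and dev_tau the scalar reflector coefficients. A hedged call sketch (helper name hypothetical; dev_a is an m x n column-major device buffer):

// Compute a packed QR factorization in place.
template <typename Scalar>
Status PackedQr(GpuSolver* solver, int m, int n, Scalar* dev_a,
                Scalar* dev_tau, int* dev_info) {
  // On success: upper triangle of dev_a = R, strict lower part =
  // Householder vectors, dev_tau = min(m, n) tau factors.
  return solver->Geqrf(m, n, dev_a, /*lda=*/m, dev_tau, dev_info);
}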
<<<<<<< HEAD

// This function performs the matrix-matrix addition/transposition
// C = alpha * op(A) + beta * op(B).
// Returns Status::OK() if the kernel was launched successfully. See:
// http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam
// NOTE(ebrevdo): Does not support in-place transpose of non-square
// matrices.
=======
// This function performs the matrix-matrix addition/transposition
// C = alpha * op(A) + beta * op(B).
>>>>>>> google_upstream/master
template <typename Scalar>
Status Geam(rocblas_operation transa, rocblas_operation transb, int m, int n,
const Scalar* alpha, /* host or device pointer */
const Scalar* A, int lda,
const Scalar* beta, /* host or device pointer */
<<<<<<< HEAD
const Scalar* B, int ldb, Scalar* C,
int ldc);
=======
const Scalar* B, int ldb, Scalar* C, int ldc);
>>>>>>> google_upstream/master

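A reference implementation of the documented semantics, e.g. for cross-checking the GPU result in a test. Plain C++ with column-major indexing; an illustrative stand-in, not the rocBLAS entry point:

#include <vector>

// C = alpha * A + beta * B for the no-transpose case, m x n column-major.
template <typename T>
void GeamNoTransRef(int m, int n, T alpha, const std::vector<T>& a, int lda,
                    T beta, const std::vector<T>& b, int ldb,
                    std::vector<T>& c, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i)
      c[i + j * ldc] = alpha * a[i + j * lda] + beta * b[i + j * ldb];
}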
// Overwrite matrix C by product of C and the unitary Householder matrix Q.
// The Householder matrix Q is represented by the output from Geqrf in dev_a
// and dev_tau.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n,
int k, const Scalar* dev_a, int lda, const Scalar* dev_tau,
=======
template <typename Scalar>
Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n, int k,
const Scalar* dev_a, int lda, const Scalar* dev_tau,
>>>>>>> google_upstream/master
Scalar* dev_c, int ldc, int* dev_lapack_info);

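Unmqr applies the implicit Q (or its adjoint) from Geqrf to another matrix without materializing Q, for example forming Q^H * C at the start of a least-squares solve. A hedged sketch (helper name hypothetical; it reuses the RocblasAdjointOp helper defined earlier in this file):

// Overwrite dev_c (m x n) with Q^H * C, where Q is the implicit factor
// held in dev_a / dev_tau (k reflectors) from a prior Geqrf call.
template <typename Scalar>
Status ApplyQAdjoint(GpuSolver* solver, int m, int n, int k,
                     const Scalar* dev_a, const Scalar* dev_tau,
                     Scalar* dev_c, int* dev_info) {
  return solver->Unmqr(rocblas_side_left, RocblasAdjointOp<Scalar>(), m, n,
                       k, dev_a, /*lda=*/m, dev_tau, dev_c, /*ldc=*/m,
                       dev_info);
}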
// Overwrites QR factorization produced by Geqrf by the unitary Householder
// matrix Q. On input, the Householder matrix Q is represented by the output
// from Geqrf in dev_a and dev_tau. On output, dev_a is overwritten with the
// first n columns of Q. Requires m >= n >= 0.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
=======
>>>>>>> google_upstream/master
template <typename Scalar>
Status Ungqr(int m, int n, int k, Scalar* dev_a, int lda,
const Scalar* dev_tau, int* dev_lapack_info);

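Ungqr is the companion that materializes Q explicitly, overwriting the packed factorization with the first n columns of Q (hence the m >= n >= 0 requirement above). A hedged sketch chaining it after Geqrf (helper name hypothetical):

// After Geqrf, overwrite dev_a with the explicit thin Q (its first n
// columns). k is the reflector count, normally min(m, n).
template <typename Scalar>
Status FormExplicitQ(GpuSolver* solver, int m, int n, int k, Scalar* dev_a,
                     const Scalar* dev_tau, int* dev_info) {
  return solver->Ungqr(m, n, k, dev_a, /*lda=*/m, dev_tau, dev_info);
}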
<<<<<<< HEAD

// Computes matrix inverses for a batch of small matrices with size n < 32.
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
const Scalar* const host_a_inverse_dev_ptrs[],
int ldainv, DeviceLapackInfo* dev_lapack_info,
int batch_size);

#else //GOOGLE_CUDA
=======
#else // GOOGLE_CUDA
>>>>>>> google_upstream/master
// ====================================================================
// Wrappers for cuSolverDN and cuBlas solvers start here.
//
@@ -531,6 +491,7 @@ class GpuSolver {
const Scalar* const dev_Aarray[], int lda,
Scalar* dev_Barray[], int ldb, int batch_size);
#endif

private:
OpKernelContext* context_; // not owned.
#if GOOGLE_CUDA
163 changes: 47 additions & 116 deletions tensorflow/core/util/rocm_solvers.cc
@@ -222,12 +222,8 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(
#define TF_CALL_LAPACK_TYPES(m) \
m(float, s) m(double, d) m(std::complex<float>, c) m(std::complex<double>, z)
#define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, s) m(double, d)
<<<<<<< HEAD
#define TF_CALL_LAPACK_TYPES_NO_REAL(m) m(std::complex<float>, c) m(std::complex<double>, z)
=======
#define TF_CALL_LAPACK_TYPES_NO_REAL(m) \
m(std::complex<float>, c) m(std::complex<double>, z)
>>>>>>> google_upstream/master

#define BLAS_SOLVER_FN(method, type_prefix) \
wrap::rocblas##_##type_prefix##method
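The TF_CALL_LAPACK_TYPES family are X-macros: each *_INSTANCE definition below is stamped out once per scalar-type / rocBLAS-prefix pair. A self-contained sketch of the expansion mechanics (it mirrors the macro defined above):

#include <complex>
#include <iostream>

#define TF_CALL_LAPACK_TYPES(m) \
  m(float, s) m(double, d) m(std::complex<float>, c) m(std::complex<double>, z)

// A toy "instance" macro: print each scalar type and its prefix.
#define PRINT_INSTANCE(Scalar, type_prefix) \
  std::cout << #Scalar << " -> " << #type_prefix << "\n";

int main() {
  TF_CALL_LAPACK_TYPES(PRINT_INSTANCE)
  // Output: float -> s, double -> d, std::complex<float> -> c,
  //         std::complex<double> -> z
}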
@@ -249,81 +245,16 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(

TF_CALL_LAPACK_TYPES(GETRF_INSTANCE);

#define GEQRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau, \
int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(geqrf, type_prefix)( \
rocm_blas_handle_, m, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau))); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GEQRF_INSTANCE);

#define UMMQR_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Unmqr(rocblas_side side, rocblas_operation trans, int m, int n, \
int k, const Scalar* dev_a, int lda, const Scalar* dev_tau, \
Scalar* dev_c, int ldc, int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) * m*k, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_copy.mutable_data(), dev_a, \
dev_a_copy.bytes())) { \
return errors::Internal("Unmqr: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_tau_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau, \
dev_tau_copy.bytes())) { \
return errors::Internal("Unmqr: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(unmqr, type_prefix)( \
rocm_blas_handle_,side,trans, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a_copy.mutable_data()), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()),reinterpret_cast<ROCmScalar*>(dev_c), ldc)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES_NO_REAL(UMMQR_INSTANCE);

#define UNGQR_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Ungqr(int m, int n, int k, Scalar* dev_a, int lda, \
const Scalar* dev_tau, int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_tau_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau, \
dev_tau_copy.bytes())) { \
return errors::Internal("Ungqr: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(ungqr, type_prefix)( \
rocm_blas_handle_, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()))); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES_NO_REAL(UNGQR_INSTANCE);


#define POTRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A, \
int lda, int* dev_lapack_info) { \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)( \
rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), \
lda, dev_lapack_info)); \
return Status::OK(); \
#define POTRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A, \
int lda, int* dev_lapack_info) { \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)( \
rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
dev_lapack_info)); \
return Status::OK(); \
}

#define GEQRF_INSTANCE(Scalar, type_prefix) \
@@ -441,43 +372,6 @@ TF_CALL_LAPACK_TYPES(GETRS_INSTANCE);

TF_CALL_LAPACK_TYPES(GETRF_BATCHED_INSTANCE);

#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::GetriBatched<Scalar>( \
int n, const Scalar* const host_a_dev_ptrs[], int lda, \
const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[], \
int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) { \
mutex_lock lock(handle_map_mutex); \
rocblas_stride stride = n; \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs, \
dev_a.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(), host_a_inverse_dev_ptrs, \
dev_a_inverse.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots, \
pivots.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)( \
rocm_blas_handle_, n, \
reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda, \
reinterpret_cast<int*>(pivots.mutable_data()), \
stride, dev_lapack_info->mutable_data(), batch_size)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);

#define POTRF_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::PotrfBatched<Scalar>( \
@@ -530,6 +424,43 @@ TF_CALL_LAPACK_TYPES(POTRF_BATCHED_INSTANCE);

TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);

#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::GetriBatched<Scalar>( \
int n, const Scalar* const host_a_dev_ptrs[], int lda, \
const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[], \
int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) { \
mutex_lock lock(handle_map_mutex); \
rocblas_stride stride = n; \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs, \
dev_a.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(), \
host_a_inverse_dev_ptrs, dev_a_inverse.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots, \
pivots.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)( \
rocm_blas_handle_, n, \
reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda, \
reinterpret_cast<int*>(pivots.mutable_data()), stride, \
dev_lapack_info->mutable_data(), batch_size)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);

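GetriBatched consumes the pivots produced by an LU factorization, so the usual flow is GetrfBatched followed by GetriBatched. A hedged sketch (GetrfBatched's exact signature is not shown in this diff and is assumed; the helper and buffer names are hypothetical):

// Invert a batch of n x n device matrices via LU factorization.
template <typename Scalar>
Status BatchedLuInverse(GpuSolver* solver, int n,
                        Scalar* const host_a_ptrs[],
                        const Scalar* const host_ainv_ptrs[],
                        int* dev_pivots, DeviceLapackInfo* dev_info,
                        int batch_size) {
  // LU-factor every matrix in place, recording pivots on the device.
  TF_RETURN_IF_ERROR(solver->GetrfBatched(n, host_a_ptrs, /*lda=*/n,
                                          dev_pivots, dev_info, batch_size));
  // Back-substitute each LU factor into its inverse.
  return solver->GetriBatched(n, host_a_ptrs, /*lda=*/n, dev_pivots,
                              host_ainv_ptrs, /*ldainv=*/n, dev_info,
                              batch_size);
}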
// Allocates a temporary tensor. The GpuSolver object maintains a
// TensorReference to the underlying Tensor to prevent it from being deallocated
// prematurely.