Fix for fft performance regression
mtaillefumier authored and oschuett committed Jan 28, 2022
1 parent 2124558 commit 21c490e
Showing 8 changed files with 567 additions and 630 deletions.
5 changes: 0 additions & 5 deletions src/grid/cpu/collocation_integration.c
@@ -11,11 +11,6 @@
#include <string.h>
#include <unistd.h>

#ifdef __GRID_CUDA
#include <cublas_v2.h>
#include <cuda.h>
#endif

#include "../common/grid_common.h"
#include "collocation_integration.h"
#include "non_orthorombic_corrections.h"
87 changes: 0 additions & 87 deletions src/grid/cpu/collocation_integration.h
@@ -12,93 +12,12 @@
#include <stdlib.h>
#include <string.h>

#ifdef __GRID_CUDA
#include <cublas_v2.h>
#include <cuda.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

#include "../cpu/tensor_local.h"
#include "cpu_private_header.h"
#ifdef __GRID_CUDA
typedef struct pgf_list_gpu_ {
/* number of devices */
int number_of_devices;

/* device_id */
int device_id;
/* */
int lmax;

/* maximum size of the batch */
int batch_size;

/* size of the batch */
int list_length;

/* number of elements occupied in the buffer */
size_t coef_dynamic_alloc_size_gpu_;

/* total size of the buffer */
size_t coef_alloc_size_gpu_;

/* size of the previously allocated coefficient table */
size_t coef_previous_alloc_size_;

/* size of the previously allocated grid */
size_t data_gpu_old_size_;

double *coef_cpu_;
double *coef_gpu_;

/* Info about the cubes */
int *coef_offset_cpu_;
double3 *rp_cpu_;
double *radius_cpu_, *radius_gpu_;

int *coef_offset_gpu_;
double3 *rp_gpu_;

/* angular momentum */
int *lmax_cpu_;
int *lmax_gpu_;

double *zeta_cpu_;
double *zeta_gpu_;

double *data_gpu_;

cudaStream_t stream;
cudaEvent_t event;
bool job_finished;

cublasHandle_t blas_handle;

int3 grid_size, grid_lower_corner_position, grid_full_size, window_shift,
window_size;

int cmax;
bool zeroing_grid;

/* size of the halo when the grid is split over multiple mpi ranks */
int *border_mask_cpu_;
int *border_mask_gpu_;

_task *task_list_cpu_;
_task *task_list_gpu_;
int3 border_width;

struct pgf_list_gpu_ *next;
/* if true, the grid on the gpu should be reallocated */
bool durty;
/* true if the buffers are used for computing already */
bool running;
bool apply_cutoff;
} pgf_list_gpu;
#endif

typedef struct collocation_integration_ {
/* number of compute device */
Expand Down Expand Up @@ -162,12 +81,6 @@ typedef struct collocation_integration_ {
int lmax_diff[2];

int cmax;

#ifdef __GRID_CUDA
pgf_list_gpu *worker_list;
int worker_list_size;
#endif

} collocation_integration;

extern struct collocation_integration_ *collocate_create_handle(void);
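The pgf_list_gpu type removed above chained one node per batch of Gaussian-product work, each carrying its own CUDA stream, completion event, and status flags (running, job_finished, durty), linked through the next pointer. Below is a minimal standalone sketch of how such a list of per-stream batches can be drained; the batch type and drain_batches() are hypothetical stand-ins, not CP2K code.

#include <stdbool.h>
#include <stddef.h>
#include <cuda_runtime.h>

/* Hypothetical stand-in for a pgf_list_gpu-style node: one CUDA stream and
 * one completion event per batch, chained through `next`. */
typedef struct batch_ {
  cudaStream_t stream;
  cudaEvent_t event;   /* recorded after the last kernel of the batch */
  bool running;        /* true while work is in flight on `stream` */
  struct batch_ *next; /* singly linked list of batches */
} batch;

/* Block until every in-flight batch in the list has finished. */
static void drain_batches(batch *head) {
  for (batch *b = head; b != NULL; b = b->next) {
    if (b->running) {
      cudaEventSynchronize(b->event);
      b->running = false;
    }
  }
}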
156 changes: 0 additions & 156 deletions src/pw/gpu/cuda/cuda_fft_private_header.hpp
@@ -100,162 +100,6 @@ static void error_check(cudaError_t cudaError, int line,
}
}

class fft_plan {
private:
cufftHandle plan_;
int size_[3] = {0, 0, 0};
int dim_{3};
int batch_size_ = -1;
// forward or backward / reverse
enum fft_direction direction_ { FFT_UNKOWN };
cudaStream_t stream_;
bool is_initialized_{false};

public:
fft_plan() {}

fft_plan(const std::vector<int> &fft_size__, const int dim__,
const int batch_size__, const enum fft_direction direction__)
: dim_(dim__), batch_size_(batch_size__) {
int inembed[2] = {0, 0};
int onembed[2] = {0, 0};
int istride, idist, ostride, odist;

direction_ = direction__;

switch (dim__) {
case 3: {
size_[0] = fft_size__[0];
size_[1] = fft_size__[1];
size_[2] = fft_size__[2];
} break;
case 2: {
size_[0] = fft_size__[1];
size_[1] = fft_size__[0];
size_[2] = batch_size__;
inembed[0] = fft_size__[1];
inembed[1] = fft_size__[0];
onembed[0] = fft_size__[1];
onembed[1] = fft_size__[0];
batch_size_ = batch_size__;

if (direction_ == CUFFT_FORWARD) {
istride = batch_size__;
idist = 1;
ostride = 1;
odist = size_[0] * size_[1];
} else {
istride = 1;
idist = size_[0] * size_[1];
ostride = batch_size__;
odist = 1;
}
} break;
case 1: {
size_[0] = fft_size__[0];
size_[1] = 1;
size_[2] = batch_size__;
batch_size_ = batch_size__;
if (direction_ == CUFFT_FORWARD) {
istride = batch_size__;
idist = 1;
ostride = 1;
odist = fft_size__[0];
} else {
istride = 1;
idist = fft_size__[0];
ostride = batch_size__;
odist = 1;
}
break;
}
default:
abort();
break;
}
if (dim_ == 3) {
fft_error_check(
cufftPlan3d(&plan_, size_[2], size_[1], size_[0], CUFFT_Z2Z),
__LINE__, __FILE__);
} else {
fft_error_check(cufftPlanMany(&plan_, dim_, &size_[0], inembed, istride,
idist, onembed, ostride, odist, CUFFT_Z2Z,
batch_size_),
__LINE__, __FILE__);
}

is_initialized_ = true;
}

void set_stream(const cudaStream_t &cuda_stream) {
stream_ = cuda_stream;
fft_error_check(cufftSetStream(plan_, stream_), __LINE__, __FILE__);
}

~fft_plan() {
if (is_initialized_)
destroy();
}

void destroy() {
error_check(cudaStreamSynchronize(stream_), __LINE__, __FILE__);
fft_error_check(cufftDestroy(plan_), __LINE__, __FILE__);
is_initialized_ = false;
}

/// run the fft on the data inplace
void execute_fft(const enum fft_direction direction__,
cufftDoubleComplex *data__) {
fft_error_check(cufftExecZ2Z(plan_, data__, data__, direction__), __LINE__,
__FILE__);
}

/// run the fft on the data out of place
void execute_fft(const enum fft_direction direction__,
cufftDoubleComplex *dataIn__,
cufftDoubleComplex *dataOut__) {
fft_error_check(cufftExecZ2Z(plan_, dataIn__, dataOut__, direction__),
__LINE__, __FILE__);
}

/// check if this plan can be used to execute the fft
bool is_it_valid(std::vector<int> size__, const int dim__, const int batch__,
const enum fft_direction direction__) const {
if (dim_ != dim__)
return false;
if (batch__ != batch_size_)
return false;
switch (dim__) {
case 3:
return ((size_[0] != size__[0]) || (size_[1] != size__[1]) ||
(size_[2] == size__[2]));
break;
case 2:
return ((size_[0] != size__[0]) || (size_[1] != size__[1]));
break;
case 1:
return (size_[0] != size__[0]);
break;
default:
return false;
break;
}

// check for the direction

if ((direction_ != direction__) && (dim_ != 3)) {
return false;
} else {
return true;
}
}

bool is_initialized() const { return is_initialized_; }
};

static void blasCreate(blasHandle_t *handle__) {
blas_error_check(cublasCreate(handle__), __LINE__, __FILE__);
}
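The fft_plan class removed above leaned on cuFFT's advanced data layout: for a forward batched 1-D transform it declared the input transposed (istride equal to the batch size, idist of 1) and the output contiguous (ostride of 1, odist of n), so the transpose happens inside the library. The following is a self-contained sketch of that layout using only the public cuFFT API; the sizes are arbitrary and error handling is mostly elided.

#include <cstdio>
#include <cuda_runtime.h>
#include <cufft.h>

int main() {
  const int n = 64, batch = 8;
  int size[1] = {n};
  int embed[1] = {0}; // must be non-NULL; the rank-1 entry itself is ignored

  // Input layout: element k of transform b at in[k * batch + b] (transposed).
  // Output layout: transform b stored contiguously starting at out[b * n].
  cufftHandle plan;
  if (cufftPlanMany(&plan, 1, size, embed, /*istride=*/batch, /*idist=*/1,
                    embed, /*ostride=*/1, /*odist=*/n, CUFFT_Z2Z,
                    batch) != CUFFT_SUCCESS) {
    std::fprintf(stderr, "cufftPlanMany failed\n");
    return 1;
  }

  cufftDoubleComplex *in = nullptr, *out = nullptr;
  cudaMalloc((void **)&in, sizeof(cufftDoubleComplex) * n * batch);
  cudaMalloc((void **)&out, sizeof(cufftDoubleComplex) * n * batch);

  cufftExecZ2Z(plan, in, out, CUFFT_FORWARD); // out of place, as in the class
  cudaDeviceSynchronize();

  cufftDestroy(plan);
  cudaFree(in);
  cudaFree(out);
  return 0;
}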
18 changes: 18 additions & 0 deletions src/pw/gpu/cuda/pw_cuda_z.cu
@@ -38,3 +38,21 @@ void gpu_gather(cudaStream_t &stream__, const double scale__, int num_points__,
pw_gather_z<double><<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(
scale__, num_points__, map_index__, dataIn__, dataOut__);
}

void real_to_complex(cudaStream_t &stream__, const int length__,
const double *src__, double *const dst__) {
dim3 blocksPerGrid, threadsPerBlock;
blocksPerGrid.x = length__ / 512 + ((length__ % 512) != 0);
threadsPerBlock.x = 512;
real_to_complex_gpu<double>
<<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(length__, src__, dst__);
}

void complex_to_real(cudaStream_t &stream__, const int length__,
const double *src__, double *const dst__) {
dim3 blocksPerGrid, threadsPerBlock;
blocksPerGrid.x = length__ / 512 + ((length__ % 512) != 0);
threadsPerBlock.x = 512;
complex_to_real_gpu<double>
<<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(length__, src__, dst__);
}
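Both launch configurations above use the usual ceiling division, length__ / 512 + ((length__ % 512) != 0), i.e. ceil(length / 512) blocks of 512 threads each. The kernel templates themselves sit outside this hunk; a plausible body consistent with the call sites (illustrative only, not necessarily the repository's real_to_complex_gpu) is:

// Illustrative guess at the kernel: each thread copies one real value into
// an interleaved complex array (real part = src[i], imaginary part = 0).
template <typename T>
__global__ void real_to_complex_gpu(const int length, const T *src, T *dst) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < length) {        // guard the partially filled last block
    dst[2 * i] = src[i];   // real part
    dst[2 * i + 1] = T(0); // imaginary part
  }
}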
