Fix for fft performance regression
mtaillefumier authored and oschuett committed Jan 28, 2022
1 parent 2124558 commit 21c490e
Showing 8 changed files with 567 additions and 630 deletions.
5 changes: 0 additions & 5 deletions src/grid/cpu/collocation_integration.c
@@ -11,11 +11,6 @@
#include <string.h>
#include <unistd.h>

#ifdef __GRID_CUDA
#include <cublas_v2.h>
#include <cuda.h>
#endif

#include "../common/grid_common.h"
#include "collocation_integration.h"
#include "non_orthorombic_corrections.h"
87 changes: 0 additions & 87 deletions src/grid/cpu/collocation_integration.h
@@ -12,93 +12,12 @@
#include <stdlib.h>
#include <string.h>

#ifdef __GRID_CUDA
#include <cublas_v2.h>
#include <cuda.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

#include "../cpu/tensor_local.h"
#include "cpu_private_header.h"
#ifdef __GRID_CUDA
typedef struct pgf_list_gpu_ {
/* number of devices */
int number_of_devices;

/* device_id */
int device_id;
/* */
int lmax;

/* maximum size of the batch */
int batch_size;

/* size of the batch */
int list_length;

/* number of elements occupied in the buffer */
size_t coef_dynamic_alloc_size_gpu_;

/* total size of the buffer */
size_t coef_alloc_size_gpu_;

/* size of the previously allocated coefficient table */
size_t coef_previous_alloc_size_;

/* size of the previously allocated grid */
size_t data_gpu_old_size_;

double *coef_cpu_;
double *coef_gpu_;

/* Info about the cubes */
int *coef_offset_cpu_;
double3 *rp_cpu_;
double *radius_cpu_, *radius_gpu_;

int *coef_offset_gpu_;
double3 *rp_gpu_;

/* angular momentum */
int *lmax_cpu_;
int *lmax_gpu_;

double *zeta_cpu_;
double *zeta_gpu_;

double *data_gpu_;

cudaStream_t stream;
cudaEvent_t event;
bool job_finished;

cublasHandle_t blas_handle;

int3 grid_size, grid_lower_corner_position, grid_full_size, window_shift,
window_size;

int cmax;
bool zeroing_grid;

/* size of the halo when the grid is split over multiple mpi ranks */
int *border_mask_cpu_;
int *border_mask_gpu_;

_task *task_list_cpu_;
_task *task_list_gpu_;
int3 border_width;

struct pgf_list_gpu_ *next;
/* if true, the grid on the gpu should be reallocated */
bool durty;
/* true if the buffers are used for computing already */
bool running;
bool apply_cutoff;
} pgf_list_gpu;
#endif

typedef struct collocation_integration_ {
/* number of compute device */
Expand Down Expand Up @@ -162,12 +81,6 @@ typedef struct collocation_integration_ {
int lmax_diff[2];

int cmax;

#ifdef __GRID_CUDA
pgf_list_gpu *worker_list;
int worker_list_size;
#endif

} collocation_integration;

extern struct collocation_integration_ *collocate_create_handle(void);
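The pgf_list_gpu type removed above chained one node per batch of Gaussian-product work, each carrying its own CUDA stream, completion event, and status flags (running, job_finished, durty), linked through the next pointer. Below is a minimal standalone sketch of how such a list of per-stream batches can be drained; the batch type and drain_batches() are hypothetical stand-ins, not CP2K code.

#include <stdbool.h>
#include <stddef.h>
#include <cuda_runtime.h>

/* Hypothetical stand-in for a pgf_list_gpu-style node: one CUDA stream and
 * one completion event per batch, chained through `next`. */
typedef struct batch_ {
  cudaStream_t stream;
  cudaEvent_t event;   /* recorded after the last kernel of the batch */
  bool running;        /* true while work is in flight on `stream` */
  struct batch_ *next; /* singly linked list of batches */
} batch;

/* Block until every in-flight batch in the list has finished. */
static void drain_batches(batch *head) {
  for (batch *b = head; b != NULL; b = b->next) {
    if (b->running) {
      cudaEventSynchronize(b->event);
      b->running = false;
    }
  }
}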
156 changes: 0 additions & 156 deletions src/pw/gpu/cuda/cuda_fft_private_header.hpp
@@ -100,162 +100,6 @@ static void error_check(cudaError_t cudaError, int line,
}
}

class fft_plan {
private:
cufftHandle plan_;
int size_[3] = {0, 0, 0};
int dim_{3};
int batch_size_ = -1;
// forward or backward / reverse
enum fft_direction direction_ { FFT_UNKOWN };
cudaStream_t stream_;
bool is_initialized_{false};

public:
fft_plan() {}

fft_plan(const std::vector<int> &fft_size__, const int dim__,
const int batch_size__, const enum fft_direction direction__)
: dim_(dim__), batch_size_(batch_size__) {
int inembed[2] = {0, 0};
int onembed[2] = {0, 0};
int istride, idist, ostride, odist;

direction_ = direction__;

switch (dim__) {
case 3: {
size_[0] = fft_size__[0];
size_[1] = fft_size__[1];
size_[2] = fft_size__[2];
} break;
case 2: {
size_[0] = fft_size__[1];
size_[1] = fft_size__[0];
size_[2] = batch_size__;
inembed[0] = fft_size__[1];
inembed[1] = fft_size__[0];
onembed[0] = fft_size__[1];
onembed[1] = fft_size__[0];
batch_size_ = batch_size__;

if (direction_ == CUFFT_FORWARD) {
istride = batch_size__;
idist = 1;
ostride = 1;
odist = size_[0] * size_[1];
} else {
istride = 1;
idist = size_[0] * size_[1];
ostride = batch_size__;
odist = 1;
}
} break;
case 1: {
size_[0] = fft_size__[0];
size_[1] = 1;
size_[2] = batch_size__;
batch_size_ = batch_size__;
if (direction_ == CUFFT_FORWARD) {
istride = batch_size__;
idist = 1;
ostride = 1;
odist = fft_size__[0];
} else {
istride = 1;
idist = fft_size__[0];
ostride = batch_size__;
odist = 1;
}
break;
}
default:
abort();
break;
}
if (dim_ == 3) {
fft_error_check(
cufftPlan3d(&plan_, size_[2], size_[1], size_[0], CUFFT_Z2Z),
__LINE__, __FILE__);
} else {
fft_error_check(cufftPlanMany(&plan_, dim_, &size_[0], inembed, istride,
idist, onembed, ostride, odist, CUFFT_Z2Z,
batch_size_),
__LINE__, __FILE__);
}

is_initialized_ = true;
}

void set_stream(const cudaStream_t &cuda_stream) {
stream_ = cuda_stream;
fft_error_check(cufftSetStream(plan_, stream_), __LINE__, __FILE__);
}

~fft_plan() {
if (is_initialized_)
destroy();
}

void destroy() {
error_check(cudaStreamSynchronize(stream_), __LINE__, __FILE__);
fft_error_check(cufftDestroy(plan_), __LINE__, __FILE__);
is_initialized_ = false;
}

/// run the fft on the data inplace
void execute_fft(const enum fft_direction direction__,
cufftDoubleComplex *data__) {
fft_error_check(cufftExecZ2Z(plan_, data__, data__, direction__), __LINE__,
__FILE__);
}

/// run the fft on the data out of place
void execute_fft(const enum fft_direction direction__,
cufftDoubleComplex *dataIn__,
cufftDoubleComplex *dataOut__) {
fft_error_check(cufftExecZ2Z(plan_, dataIn__, dataOut__, direction__),
__LINE__, __FILE__);
}

/// check if this plan can be used to execute the fft
bool is_it_valid(std::vector<int> size__, const int dim__, const int batch__,
const enum fft_direction direction__) const {
if (dim_ != dim__)
return false;
if (batch__ != batch_size_)
return false;
switch (dim__) {
case 3:
return ((size_[0] != size__[0]) || (size_[1] != size__[1]) ||
(size_[2] == size__[2]));
break;
case 2:
return ((size_[0] != size__[0]) || (size_[1] != size__[1]));
break;
case 1:
return (size_[0] != size__[0]);
break;
default:
return false;
break;
}

// check for the direction

if ((direction_ != direction__) && (dim_ != 3)) {
return false;
} else {
return true;
}
}

bool is_initialized() const { return is_initialized_; }
};

static void blasCreate(blasHandle_t *handle__) {
blas_error_check(cublasCreate(handle__), __LINE__, __FILE__);
}
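The fft_plan class removed above leaned on cuFFT's advanced data layout: for a forward batched 1-D transform it declared the input transposed (istride equal to the batch size, idist of 1) and the output contiguous (ostride of 1, odist of n), so the transpose happens inside the library. The following is a self-contained sketch of that layout using only the public cuFFT API; the sizes are arbitrary and error handling is mostly elided.

#include <cstdio>
#include <cuda_runtime.h>
#include <cufft.h>

int main() {
  const int n = 64, batch = 8;
  int size[1] = {n};
  int embed[1] = {0}; // must be non-NULL; the rank-1 entry itself is ignored

  // Input layout: element k of transform b at in[k * batch + b] (transposed).
  // Output layout: transform b stored contiguously starting at out[b * n].
  cufftHandle plan;
  if (cufftPlanMany(&plan, 1, size, embed, /*istride=*/batch, /*idist=*/1,
                    embed, /*ostride=*/1, /*odist=*/n, CUFFT_Z2Z,
                    batch) != CUFFT_SUCCESS) {
    std::fprintf(stderr, "cufftPlanMany failed\n");
    return 1;
  }

  cufftDoubleComplex *in = nullptr, *out = nullptr;
  cudaMalloc((void **)&in, sizeof(cufftDoubleComplex) * n * batch);
  cudaMalloc((void **)&out, sizeof(cufftDoubleComplex) * n * batch);

  cufftExecZ2Z(plan, in, out, CUFFT_FORWARD); // out of place, as in the class
  cudaDeviceSynchronize();

  cufftDestroy(plan);
  cudaFree(in);
  cudaFree(out);
  return 0;
}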
18 changes: 18 additions & 0 deletions src/pw/gpu/cuda/pw_cuda_z.cu
@@ -38,3 +38,21 @@ void gpu_gather(cudaStream_t &stream__, const double scale__, int num_points__,
pw_gather_z<double><<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(
scale__, num_points__, map_index__, dataIn__, dataOut__);
}

void real_to_complex(cudaStream_t &stream__, const int length__,
const double *src__, double *const dst__) {
dim3 blocksPerGrid, threadsPerBlock;
blocksPerGrid.x = length__ / 512 + ((length__ % 512) != 0);
threadsPerBlock.x = 512;
real_to_complex_gpu<double>
<<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(length__, src__, dst__);
}

void complex_to_real(cudaStream_t &stream__, const int length__,
const double *src__, double *const dst__) {
dim3 blocksPerGrid, threadsPerBlock;
blocksPerGrid.x = length__ / 512 + ((length__ % 512) != 0);
threadsPerBlock.x = 512;
complex_to_real_gpu<double>
<<<blocksPerGrid, threadsPerBlock, 0, stream__>>>(length__, src__, dst__);
}
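Both launch configurations above use the usual ceiling division, length__ / 512 + ((length__ % 512) != 0), i.e. ceil(length / 512) blocks of 512 threads each. The kernel templates themselves sit outside this hunk; a plausible body consistent with the call sites (illustrative only, not necessarily the repository's real_to_complex_gpu) is:

// Illustrative guess at the kernel: each thread copies one real value into
// an interleaved complex array (real part = src[i], imaginary part = 0).
template <typename T>
__global__ void real_to_complex_gpu(const int length, const T *src, T *dst) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < length) {        // guard the partially filled last block
    dst[2 * i] = src[i];   // real part
    dst[2 * i + 1] = T(0); // imaginary part
  }
}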
