Permalink
Browse files

Fix several issues

- Optimize example fmm_pts, for the case where source and target points
  are the same.

- Fix warnings in cheb_nodes.txx, fmm_cheb.txx

- In fmm_cheb.txx, add call to FMM_Pts<FMMNode>::Source2Up

- In fmm_pts.hpp, add staging_buffer which was remove earlier to
  optimize memory usage.

- Fix issue with pt_cnt for upward pass translations, where, interaction
  list was not constructed for octants with no target points..

- Add single precision versions of cuda functions.

- Increase BC_LEVELS to 60. This is needed for double precision
  accuracy.

- Add strong scaling script for particle FMM.
  • Loading branch information...
dmalhotra committed Feb 1, 2015
1 parent 515a6a0 commit 987aa2361dc901983ca347f02b2574edabb0803f
@@ -52,17 +52,19 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
tree_data.max_depth=depth;
tree_data.max_pts=M; // Points per octant.

//Set source coordinates and values.
std::vector<Real_t> src_coord, src_value;
src_coord=point_distrib<Real_t>((dist==0?UnifGrid:(dist==1?RandSphr:RandElps)),N,comm);
for(size_t i=0;i<src_coord.size();i++) src_coord[i]*=b;
for(size_t i=0;i<src_coord.size()*mykernel->ker_dim[0]/COORD_DIM;i++) src_value.push_back(drand48());
tree_data.pt_coord=src_coord;
tree_data.src_coord=src_coord;
tree_data.src_value=src_value;

//Set target coordinates.
tree_data.trg_coord=tree_data.src_coord;
{ //Set particle coordinates and values.
std::vector<Real_t> src_coord, src_value;
src_coord=point_distrib<Real_t>((dist==0?UnifGrid:(dist==1?RandSphr:RandElps)),N,comm);
for(size_t i=0;i<src_coord.size();i++) src_coord[i]*=b;
for(size_t i=0;i<src_coord.size()*mykernel->ker_dim[0]/COORD_DIM;i++) src_value.push_back(drand48()-0.5);
tree_data.pt_coord=src_coord;
tree_data.pt_value=src_value;
//tree_data.src_coord=src_coord;
//tree_data.src_value=src_value;

//Set target coordinates.
//tree_data.trg_coord=tree_data.src_coord;
}

//Print various parameters.
if(!myrank){
@@ -93,6 +95,19 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
tree.Initialize(&tree_data);

//Initialize FMM Tree
pvfmm::Profile::Tic("SetSrcTrg",&comm,true);
{ // Set src and trg points
std::vector<FMMNode_t*>& node=tree.GetNodeList();
#pragma omp parallel for
for(size_t i=0;i<node.size();i++){
node[i]-> trg_coord.ReInit(node[i]-> pt_coord.Dim(), &node[i]-> pt_coord[0]);
node[i]-> src_coord.ReInit(node[i]-> pt_coord.Dim(), &node[i]-> pt_coord[0]);
node[i]-> src_value.ReInit(node[i]-> pt_value.Dim(), &node[i]-> pt_value[0]);
node[i]->trg_scatter.ReInit(node[i]->pt_scatter.Dim(), &node[i]->pt_scatter[0]);
node[i]->src_scatter.ReInit(node[i]->pt_scatter.Dim(), &node[i]->pt_scatter[0]);
}
}
pvfmm::Profile::Toc();
tree.InitFMM_Tree(false,bndry);

// Setup FMM
@@ -136,9 +136,9 @@ void Cheb_Node<Real_t>::Subdivide() {
Vector<Real_t> child_cheb_coeff[8];
int n=(1UL<<this->Dim());
for(int i=0;i<n;i++){
Real_t coord[3]={((i )%2?0:-1.0),
((i/2)%2?0:-1.0),
((i/4)%2?0:-1.0)};
Real_t coord[3]={(Real_t)((i )%2?0:-1.0),
(Real_t)((i/2)%2?0:-1.0),
(Real_t)((i/4)%2?0:-1.0)};
for(int j=0;j<=cheb_deg;j++){
x[j]=cheb_node[j]+coord[0];
y[j]=cheb_node[j]+coord[1];
@@ -581,7 +581,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
// Coord of target points
Real_t s=pow(0.5,level);
int* coord=this->interac_list.RelativeCoord(type,mat_indx);
Real_t coord_diff[3]={(coord[0]-1)*s*0.5,(coord[1]-1)*s*0.5,(coord[2]-1)*s*0.5};
Real_t coord_diff[3]={(Real_t)((coord[0]-1)*s*0.5),(Real_t)((coord[1]-1.0)*s*0.5),(Real_t)((coord[2]-1.0)*s*0.5)};
std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
size_t n_trg = rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
@@ -681,7 +681,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
// Coord of target points
Real_t s=pow(0.5,level);
int* coord=this->interac_list.RelativeCoord(type,mat_indx);
Real_t coord_diff[3]={(coord[0]+1)*s*0.25,(coord[1]+1)*s*0.25,(coord[2]+1)*s*0.25};
Real_t coord_diff[3]={(Real_t)((coord[0]+1)*s*0.25),(Real_t)((coord[1]+1)*s*0.25),(Real_t)((coord[2]+1)*s*0.25)};
std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
size_t n_trg = rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
@@ -752,7 +752,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
// Coord of target points
Real_t s=pow(0.5,level-1);
int* coord=this->interac_list.RelativeCoord(type,mat_indx);
Real_t c[3]={-(coord[0]-1)*s*0.25,-(coord[1]-1)*s*0.25,-(coord[2]-1)*s*0.25};
Real_t c[3]={-(Real_t)((coord[0]-1)*s*0.25),-(Real_t)((coord[1]-1)*s*0.25),-(Real_t)((coord[2]-1)*s*0.25)};
std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,level);
size_t n_trg=trg_coord.size()/3;

@@ -825,10 +825,14 @@ void FMM_Cheb<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>&
}
}
}
#pragma omp parallel for
for(size_t i=0;i<node.size();i++){
node[i]->pt_cnt[0]+=2*n_coeff;
node[i]->pt_cnt[1]+=2*n_coeff;
{ // Set pt_cnt
size_t m=this->MultipoleOrder();
size_t Nsrf=(6*(m-1)*(m-1)+2);
#pragma omp parallel for
for(size_t i=0;i<node.size();i++){
node[i]->pt_cnt[0]+=2*Nsrf;
node[i]->pt_cnt[1]+=2*Nsrf;
}
}
FMM_Pts<FMMNode_t>::CollectNodeData(tree, node, buff, n_list, vec_list);
}
@@ -870,6 +874,7 @@ template <class FMMNode>
void FMM_Cheb<FMMNode>::Source2Up (SetupData<Real_t>& setup_data, bool device){
if(!this->MultipoleOrder()) return;
//Add Source2Up contribution.
FMM_Pts<FMMNode>::Source2Up(setup_data, device);
this->EvalList(setup_data, device);
}

@@ -207,6 +207,7 @@ class FMM_Pts{
virtual void CopyOutput(FMMNode** nodes, size_t n);

Vector<char> dev_buffer;
Vector<char> staging_buffer;

protected:

@@ -1606,6 +1606,11 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
if(device){ // Host2Device
Profile::Tic("Host2Device",&this->comm,false,25);
setup_data.interac_data .AllocDevice(true);
if(staging_buffer.Dim()<sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1)){
staging_buffer.ReInit(sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1));
staging_buffer.SetZero();
staging_buffer.AllocDevice(true);
}
Profile::Toc();
}
}
@@ -1994,8 +1999,8 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t*

setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[0] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && (nodes_out[i]->src_coord.Dim() || nodes_out[i]->surf_coord.Dim()) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}

struct PackedData{
@@ -4122,8 +4127,8 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tr

setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf ()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}

struct PackedData{
@@ -4424,8 +4429,8 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tr

setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}

struct PackedData{
@@ -4709,8 +4714,8 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tre

setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && (nodes_out[i]->trg_coord.Dim() ) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}

struct PackedData{
@@ -5105,8 +5110,8 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, FMMTree_

setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[1] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->trg_coord.Dim() && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}

struct PackedData{
@@ -1,33 +1,36 @@
#ifndef _CUDA_FUNC_HPP_
#define _CUDA_FUNC_HPP_

#include <pvfmm_common.hpp>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cstring>
#include <device_wrapper.hpp>
#include <matrix.hpp>
#include <vector.hpp>

#ifdef __cplusplus
extern "C" {
#endif
void in_perm_d(char* precomp_data, double* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
void out_perm_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
void in_perm_gpu_f(char* precomp_data, float * input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
void in_perm_gpu_d(char* precomp_data, double* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);

void out_perm_gpu_f(char* precomp_data, float * output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
void out_perm_gpu_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
#ifdef __cplusplus
}
#endif

template <class Real_t>
void in_perm_gpu(char* precomp_data, Real_t* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
in_perm_d (precomp_data, input_data, buff_in , input_perm, vec_cnt, M_dim0, stream);
};
void in_perm_gpu(char* precomp_data, Real_t* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);

template <class Real_t>
void out_perm_gpu(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
out_perm_d(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
};
void out_perm_gpu(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);

template<> void in_perm_gpu<float >(char* precomp_data, float * input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
in_perm_gpu_f (precomp_data, input_data, buff_in , input_perm, vec_cnt, M_dim0, stream);
}
template<> void in_perm_gpu<double>(char* precomp_data, double* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
in_perm_gpu_d (precomp_data, input_data, buff_in , input_perm, vec_cnt, M_dim0, stream);
}

template<> void out_perm_gpu<float >(char* precomp_data, float * output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
out_perm_gpu_f(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
}
template<> void out_perm_gpu<double>(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
out_perm_gpu_d(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
}

#endif //_CUDA_FUNC_HPP_
Oops, something went wrong.

0 comments on commit 987aa23

Please sign in to comment.