From 29ea4261ff906b713b0b35a380300118747e6c52 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Sun, 28 Feb 2016 18:11:34 -0500 Subject: [PATCH] speed up hitgrid with mcx_nextafter and reciprocal of c, manually resync with sourceforge svn --- src/Makefile | 7 +++- src/mcx_core.cu | 99 +++++++++++++++++++++++++++---------------------- 2 files changed, 61 insertions(+), 45 deletions(-) diff --git a/src/Makefile b/src/Makefile index a7fe2534..ad095f1f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,6 +10,8 @@ ifeq ($(BACKEND),ocelot) CC?=g++ endif +MKDIR := mkdir + CUDACC=nvcc AR=$(CC) @@ -106,7 +108,10 @@ OBJS := $(addsuffix $(OBJSUFFIX), $(FILES)) all mt fast log logfast racing mtatomic logatomic mtbox logbox debugmt debuglog \ det detbox fermi mex oct mexbox octbox fermimex fermioct: cudasdk $(OUTPUT_DIR)/$(BINARY) -$(OUTPUT_DIR)/$(BINARY): $(OBJS) +makedirs: + @if test ! -d $(OUTPUT_DIR); then $(MKDIR) $(OUTPUT_DIR); fi + +$(OUTPUT_DIR)/$(BINARY): makedirs $(OBJS) $(AR) $(OBJS) -o $(OUTPUT_DIR)/$(BINARY) $(LINKOPT) %$(OBJSUFFIX): %.c diff --git a/src/mcx_core.cu b/src/mcx_core.cu index 24c0d196..2c32e10a 100644 --- a/src/mcx_core.cu +++ b/src/mcx_core.cu @@ -122,31 +122,40 @@ __device__ inline void savedetphoton(float n_det[],uint *detectedphoton,float ns } #endif -__device__ inline float hitgrid(float3 *p0, float3 *v, float3 *htime,int *id){ +__device__ inline float mcx_nextafterf(float a, int dir){ + union{ + float f; + uint i; + } num; + num.f=a+gcfg->maxvoidstep; + num.i+=dir ^ (num.i & 0x80000000U); + return num.f-gcfg->maxvoidstep; +} + +__device__ inline float hitgrid(float3 *p0, float3 *v, float *htime,float* rv,int *id){ float dist; //time-of-flight to hit the wall in each direction - htime->x=fabs(__fdividef(floorf(p0->x)+(v->x>0.f)-p0->x,v->x)); - htime->y=fabs(__fdividef(floorf(p0->y)+(v->y>0.f)-p0->y,v->y)); - htime->z=fabs(__fdividef(floorf(p0->z)+(v->z>0.f)-p0->z,v->z)); + htime[0]=(floorf(p0->x)+(v->x>0.f)-p0->x)*rv[0]; // absolute distance of travel in x/y/z + htime[1]=(floorf(p0->y)+(v->y>0.f)-p0->y)*rv[1]; + htime[2]=(floorf(p0->z)+(v->z>0.f)-p0->z)*rv[2]; //get the direction with the smallest time-of-flight - dist=fminf(fminf(htime->x,htime->y),htime->z); - (*id)=(dist==htime->x?0:(dist==htime->y?1:2)); + dist=fminf(fminf(htime[0],htime[1]),htime[2]); + (*id)=(dist==htime[0]?0:(dist==htime[1]?1:2)); //p0 is inside, p is outside, move to the 1st intersection pt, now in the air side, to be corrected in the else block - htime->x=p0->x+dist*v->x; - htime->y=p0->y+dist*v->y; - htime->z=p0->z+dist*v->z; + htime[0]=p0->x+dist*v->x; + htime[1]=p0->y+dist*v->y; + htime[2]=p0->z+dist*v->z; - // make sure photon crosses the boundary (*id==0) ? - (htime->x=nextafterf(__float2int_rn(htime->x), htime->x+(v->x > 0.f)-0.5f)) : + (htime[0]=mcx_nextafterf(__float2int_rn(htime[0]), (v->x > 0.f)-(v->x < 0.f))) : ((*id==1) ? - (htime->y=nextafterf(__float2int_rn(htime->y), htime->y+(v->y > 0.f)-0.5f)) : - (htime->z=nextafterf(__float2int_rn(htime->z), htime->z+(v->z > 0.f)-0.5f)) ); + (htime[1]=mcx_nextafterf(__float2int_rn(htime[1]), (v->y > 0.f)-(v->y < 0.f))) : + (htime[2]=mcx_nextafterf(__float2int_rn(htime[2]), (v->z > 0.f)-(v->z < 0.f))) ); - return fabs(dist); + return dist; } __device__ inline void transmit(MCXdir *v, float n1, float n2,int flipdir){ @@ -183,7 +192,7 @@ __device__ inline float reflectcoeff(MCXdir *v, float n1, float n2, int flipdir) /* if the source location is outside of the volume or in an void voxel, mcx advances the photon in v.{xyz} direction until it hits an non-zero voxel */ -__device__ inline int skipvoid(MCXpos *p,MCXdir *v,MCXtime *f,uchar media[]){ +__device__ inline int skipvoid(MCXpos *p,MCXdir *v,MCXtime *f,float3* rv,uchar media[]){ int count=1,idx1d; while(1){ if(p->x>=0.f && p->y>=0.f && p->z>=0.f && p->x < gcfg->maxidx.x @@ -203,7 +212,7 @@ __device__ inline int skipvoid(MCXpos *p,MCXdir *v,MCXtime *f,uchar media[]){ count=0; while(!(p->x>=0.f && p->y>=0.f && p->z>=0.f && p->x < gcfg->maxidx.x && p->y < gcfg->maxidx.y && p->z < gcfg->maxidx.z) || !media[idx1d]){ // at most 3 times - f->t+=gcfg->minaccumtime*hitgrid((float3*)p,(float3*)v,&htime,&flipdir); + f->t+=gcfg->minaccumtime*hitgrid((float3*)p,(float3*)v,&htime.x,&rv->x,&flipdir); *((float4*)(p))=float4(htime.x,htime.y,htime.z,p->w); idx1d=(int(floorf(p->z))*gcfg->dimlen.y+int(floorf(p->y))*gcfg->dimlen.x+int(floorf(p->x))); GPUDEBUG(("entry p=[%f %f %f]\n",p->x,p->y,p->z)); @@ -255,7 +264,7 @@ __device__ inline void rotatevector(MCXdir *v, float stheta, float ctheta, float GPUDEBUG(("new dir: %10.5e %10.5e %10.5e\n",v->x,v->y,v->z)); } -__device__ inline int launchnewphoton(MCXpos *p,MCXdir *v,MCXtime *f,Medium *prop,uint *idx1d, +__device__ inline int launchnewphoton(MCXpos *p,MCXdir *v,MCXtime *f,float3* rv,Medium *prop,uint *idx1d, uchar *mediaid,float *w0,float *Lmove,uchar isdet, float ppath[],float energyloss[],float energylaunched[],float n_det[],uint *dpnum, RandType t[RAND_BUF_LEN],RandType tnew[RAND_BUF_LEN],RandType photonseed[RAND_BUF_LEN], uchar media[],float srcpattern[],int threadid,RandType rngseed[],RandType seeddata[]){ @@ -425,7 +434,7 @@ __device__ inline int launchnewphoton(MCXpos *p,MCXdir *v,MCXtime *f,Medium *pro } } if((*mediaid & MED_MASK)==0){ - int idx=skipvoid(p, v, f, media); /*specular reflection of the bbx is taken care of here*/ + int idx=skipvoid(p, v, f, rv, media); /*specular reflection of the bbx is taken care of here*/ if(idx>=0){ *idx1d=idx; *mediaid=media[*idx1d]; @@ -445,6 +454,7 @@ __device__ inline int launchnewphoton(MCXpos *p,MCXdir *v,MCXtime *f,Medium *pro *energylaunched+=p->w; *w0=p->w; *Lmove=0.f; + *rv=float3(__fdividef(1.f,v->x),__fdividef(1.f,v->y),__fdividef(1.f,v->z)); return 0; } @@ -478,9 +488,8 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see MCXdir *v=(MCXdir*)(sharedmem+(threadIdx.x<<2)); //{x,y,z}: unitary direction vector in grid unit, nscat:total scat event MCXtime f; //pscat: remaining scattering probability,t: photon elapse time, //tnext: next accumulation time, ndone: completed photons - float energyloss=genergy[idx*3]; - float energyabsorbed=genergy[idx*3+1]; - float energylaunched=genergy[idx*3+2]; + float energyloss=genergy[idx<<1]; + float energylaunched=genergy[(idx<<1)+1]; uint idx1d, idx1dold; //idx1dold is related to reflection uint moves=0; @@ -491,7 +500,8 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see uchar mediaid=gcfg->mediaidorig; uchar mediaidold=0; float n1; //reflection var - float3 htime; //reflection var + float3 htime; //time-of-fly for collision test + float3 rv; //reciprocal velocity //for MT RNG, these will be zero-length arrays and be optimized out RandType *t=(RandType*)(sharedmem+(blockDim.x<<2)+threadIdx.x*RAND_BUF_LEN); @@ -522,7 +532,7 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see gpu_rng_init(t,tnew,n_seed,idx); - if(launchnewphoton(&p,v,&f,&prop,&idx1d,&mediaid,&w0,&Lmove,0,ppath,&energyloss, + if(launchnewphoton(&p,v,&f,&rv,&prop,&idx1d,&mediaid,&w0,&Lmove,0,ppath,&energyloss, &energylaunched,n_det,detectedphoton,t,tnew,photonseed,media,srcpattern, idx,(RandType*)n_seed,seeddata)){ n_seed[idx]=NO_LAUNCH; @@ -531,6 +541,7 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see n_len[idx]=*((float4*)(&f)); return; } + rv=float3(__fdividef(1.f,v->x),__fdividef(1.f,v->y),__fdividef(1.f,v->z)); /* using a while-loop to terminate a thread by np.will cause MT RNG to be 3.5x slower @@ -583,13 +594,14 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see GPUDEBUG(("scat theta=%f\n",theta)); rotatevector(v,stheta,ctheta,sphi,cphi); v->nscat++; + rv=float3(__fdividef(1.f,v->x),__fdividef(1.f,v->y),__fdividef(1.f,v->z)); } } n1=prop.n; *((float4*)(&prop))=gproperty[mediaid & MED_MASK]; - len=(gcfg->faststep) ? gcfg->minstep : hitgrid((float3*)&p,(float3*)v,&htime,&flipdir); // propagate the photon to the first intersection to the grid + len=(gcfg->faststep) ? gcfg->minstep : hitgrid((float3*)&p,(float3*)v,&(htime.x),&rv.x,&flipdir); // propagate the photon to the first intersection to the grid slen=len*prop.mus; //unitless (minstep=grid, mus=1/grid) GPUDEBUG(("p=[%f %f %f] -> <%f %f %f>*%f -> hit=[%f %f %f] flip=%d\n",p.x,p.y,p.z,v->x,v->y,v->z,len,htime.x,htime.y,htime.z,flipdir)); @@ -638,7 +650,6 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see GPUDEBUG(("deposit to [%d] %e, w=%f\n",idx1dold,weight,p.w)); - energyabsorbed+=w0-p.w; #ifdef TEST_RACING // enable TEST_RACING to determine how many missing accumulations due to race if( (p.x-gcfg->ps.x)*(p.x-gcfg->ps.x)+(p.y-gcfg->ps.y)*(p.y-gcfg->ps.y)+(p.z-gcfg->ps.z)*(p.z-gcfg->ps.z)>gcfg->skipradius2) { @@ -685,7 +696,7 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see if((mediaid==0 && (!gcfg->doreflect || (gcfg->doreflect && n1==gproperty[mediaid].w))) || f.t>gcfg->twin1){ GPUDEBUG(("direct relaunch at idx=[%d] mediaid=[%d], ref=[%d]\n",idx1d,mediaid,gcfg->doreflect)); - if(launchnewphoton(&p,v,&f,&prop,&idx1d,&mediaid,&w0,&Lmove,(mediaidold & DET_MASK),ppath, + if(launchnewphoton(&p,v,&f,&rv,&prop,&idx1d,&mediaid,&w0,&Lmove,(mediaidold & DET_MASK),ppath, &energyloss,&energylaunched,n_det,detectedphoton,t,tnew,photonseed,media,srcpattern,idx,(RandType*)n_seed,seeddata)) break; continue; @@ -718,7 +729,7 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see if(Rtotal<1.f && rand_next_reflect(t,tnew)>Rtotal){ // do transmission if(mediaid==0){ // transmission to external boundary GPUDEBUG(("transmit to air, relaunch\n")); - if(launchnewphoton(&p,v,&f,&prop,&idx1d,&mediaid,&w0,&Lmove,(mediaidold & DET_MASK), + if(launchnewphoton(&p,v,&f,&rv,&prop,&idx1d,&mediaid,&w0,&Lmove,(mediaidold & DET_MASK), ppath,&energyloss,&energylaunched,n_det,detectedphoton,t,tnew,photonseed, media,srcpattern,idx,(RandType*)n_seed,seeddata)) break; @@ -726,14 +737,16 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see } GPUDEBUG(("do transmission\n")); transmit(v,n1,prop.n,flipdir); + rv=float3(__fdividef(1.f,v->x),__fdividef(1.f,v->y),__fdividef(1.f,v->z)); }else{ //do reflection GPUDEBUG(("ref faceid=%d p=[%f %f %f] v_old=[%f %f %f]\n",flipdir,p.x,p.y,p.z,v->x,v->y,v->z)); (flipdir==0) ? (v->x=-v->x) : ((flipdir==1) ? (v->y=-v->y) : (v->z=-v->z)) ; + rv=float3(__fdividef(1.f,v->x),__fdividef(1.f,v->y),__fdividef(1.f,v->z)); (flipdir==0) ? - (p.x=nextafterf(__float2int_rn(p.x), p.x+(v->x > 0.f)-0.5f)) : + (p.x=mcx_nextafterf(__float2int_rn(p.x), (v->x > 0.f)-(v->x < 0.f))) : ((flipdir==1) ? - (p.y=nextafterf(__float2int_rn(p.y), p.y+(v->y > 0.f)-0.5f)) : - (p.z=nextafterf(__float2int_rn(p.z), p.z+(v->z > 0.f)-0.5f)) ); + (p.y=mcx_nextafterf(__float2int_rn(p.y), (v->x > 0.f)-(v->x < 0.f))) : + (p.z=mcx_nextafterf(__float2int_rn(p.z), (v->x > 0.f)-(v->x < 0.f))) ); GPUDEBUG(("ref p_new=[%f %f %f] v_new=[%f %f %f]\n",p.x,p.y,p.z,v->x,v->y,v->z)); idx1d=idx1dold; mediaid=(media[idx1d] & MED_MASK); @@ -755,9 +768,8 @@ kernel void mcx_main_loop(uchar media[],float field[],float genergy[],uint n_see f.tnext=accumweight; #endif - genergy[idx*3]=energyloss; - genergy[idx*3+1]=energyabsorbed; - genergy[idx*3+2]=energylaunched; + genergy[idx<<1]=energyloss; + genergy[(idx<<1)+1]=energylaunched; #ifdef TEST_RACING n_seed[idx]=cc; @@ -1064,7 +1076,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){ Pdir=(float4*)malloc(sizeof(float4)*gpu[gpuid].autothread); Plen=(float4*)malloc(sizeof(float4)*gpu[gpuid].autothread); Plen0=(float4*)malloc(sizeof(float4)*gpu[gpuid].autothread); - energy=(float*)calloc(gpu[gpuid].autothread*3,sizeof(float)); + energy=(float*)calloc(gpu[gpuid].autothread<<1,sizeof(float)); Pdet=(float*)calloc(cfg->maxdetphoton,sizeof(float)*(detreclen)); Pseed=(uint*)malloc(sizeof(uint)*gpu[gpuid].autothread*RAND_SEED_LEN); @@ -1076,7 +1088,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){ mcx_cu_assess(cudaMalloc((void **) &gPlen, sizeof(float4)*gpu[gpuid].autothread),__FILE__,__LINE__); mcx_cu_assess(cudaMalloc((void **) &gPdet, sizeof(float)*cfg->maxdetphoton*(detreclen)),__FILE__,__LINE__); mcx_cu_assess(cudaMalloc((void **) &gdetected, sizeof(uint)),__FILE__,__LINE__); - mcx_cu_assess(cudaMalloc((void **) &genergy, sizeof(float)*gpu[gpuid].autothread*3),__FILE__,__LINE__); + mcx_cu_assess(cudaMalloc((void **) &genergy, sizeof(float)*(gpu[gpuid].autothread<<1)),__FILE__,__LINE__); if(cfg->issaveseed){ seeddata=(RandType*)malloc(sizeof(RandType)*cfg->maxdetphoton*RAND_SEED_LEN); mcx_cu_assess(cudaMalloc((void **) &gseeddata, sizeof(RandType)*cfg->maxdetphoton*RAND_SEED_LEN),__FILE__,__LINE__); @@ -1163,7 +1175,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){ fflush(cfg->flog); cudaMemcpy(gmedia, media, sizeof(uchar) *dimxyz, cudaMemcpyHostToDevice); - cudaMemcpy(genergy,energy,sizeof(float) *gpu[gpuid].autothread*3, cudaMemcpyHostToDevice); + cudaMemcpy(genergy,energy,sizeof(float) *(gpu[gpuid].autothread<<1), cudaMemcpyHostToDevice); if(cfg->srcpattern) cudaMemcpy(gsrcpattern,cfg->srcpattern,sizeof(float)*(int)(cfg->srcparam1.w*cfg->srcparam2.w), cudaMemcpyHostToDevice); @@ -1289,12 +1301,11 @@ is more than what your have specified (%d), please use the -H option to specify memcpy(field,field+fieldlen,sizeof(float)*fieldlen); if(cfg->isnormalized){ - cudaMemcpy(energy,genergy,sizeof(float)*gpu[gpuid].autothread*3,cudaMemcpyDeviceToHost); + cudaMemcpy(energy,genergy,sizeof(float)*(gpu[gpuid].autothread<<1),cudaMemcpyDeviceToHost); #pragma omp critical for(i=0;ienergyesc+=energy[3*i]; - cfg->energyabs+=energy[3*i+1]; - cfg->energytot+=energy[3*i+2]; + cfg->energyesc+=energy[i<<1]; + cfg->energytot+=energy[(i<<1)+1]; } for(i=0;ienergyabs+=Plen0[i].z; // the accumulative absorpted energy near the source @@ -1308,7 +1319,7 @@ is more than what your have specified (%d), please use the -H option to specify } if(param.twin1tend){ - cudaMemset(genergy,0,sizeof(float)*gpu[gpuid].autothread*3); + cudaMemset(genergy,0,sizeof(float)*(gpu[gpuid].autothread<<1)); } } /*end of time-gate group loop*/ #pragma omp barrier @@ -1319,9 +1330,9 @@ is more than what your have specified (%d), please use the -H option to specify if(cfg->isnormalized){ float scale; MCX_FPRINTF(cfg->flog,"normalizing raw data ...\t"); - + cfg->energyabs+=cfg->energytot-cfg->energyesc; if(cfg->outputtype==otFlux || cfg->outputtype==otFluence){ - scale=(cfg->energytot-cfg->energyesc)/(cfg->energytot*Vvox*cfg->tstep*cfg->energyabs); + scale=1.f/(cfg->energytot*Vvox*cfg->tstep); if(cfg->unitinmm!=1.f) scale*=cfg->unitinmm; /* Vvox (in mm^3 already) * (Tstep) * (Eabsorp/U) */ @@ -1355,7 +1366,7 @@ is more than what your have specified (%d), please use the -H option to specify cudaMemcpy(Pdir, gPdir, sizeof(float4)*gpu[gpuid].autothread, cudaMemcpyDeviceToHost); cudaMemcpy(Plen, gPlen, sizeof(float4)*gpu[gpuid].autothread, cudaMemcpyDeviceToHost); cudaMemcpy(Pseed, gPseed,sizeof(uint) *gpu[gpuid].autothread*RAND_SEED_LEN, cudaMemcpyDeviceToHost); - cudaMemcpy(energy,genergy,sizeof(float)*gpu[gpuid].autothread*3,cudaMemcpyDeviceToHost); + cudaMemcpy(energy,genergy,sizeof(float)*(gpu[gpuid].autothread<<1),cudaMemcpyDeviceToHost); #ifdef TEST_RACING {