Skip to content

Commit

Permalink
Merge pull request #17 from 3upperm2n/revert-16-fangqq
Browse files: browse the repository at this point in the history
Revert "optimize photon workload at work-group level"
  • Loading branch information
fangq committed Jan 27, 2017
2 parents fb47e8d + aef375d commit 0e2906c
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 22 deletions.
3 changes: 1 addition & 2 deletions src/Makefile
Expand Up @@ -12,8 +12,7 @@ OUTPUT_DIR=../bin
INCLUDEDIRS=#-I/home/fangq/Download/ati-stream-sdk-v2.0-lnx32/include
AMDAPPSDKROOT ?=/opt/AMDAPPSDK-2.9-1
LIBOPENCLDIR ?=$(AMDAPPSDKROOT)/lib/x86_64
CUDAOPENCLDIR ?=/usr/local/cuda/lib64
LINKOPT=-g -L$(LIBOPENCLDIR) -L$(CUDAOPENCLDIR) -lOpenCL
LINKOPT=-g -L$(LIBOPENCLDIR) -lOpenCL

CUCCOPT=-I/usr/local/cuda/include #-m32 -msse2 -Wfloat-equal -Wpointer-arith -DATI_OS_LINUX -g3 -ffor-scope
CCOPT=-g -pedantic -Wall -O3 -DMCX_OPENCL -DUSE_OS_TIMER -I/usr/local/cuda/include -std=c99#-O3
Expand Down
19 changes: 5 additions & 14 deletions src/mcx_core.cl
Expand Up @@ -287,7 +287,7 @@ void rotatevector(float4 v[], float stheta, float ctheta, float sphi, float cphi
int launchnewphoton(float4 p[],float4 v[],float4 f[],float4 prop[],uint *idx1d,
uint *mediaid,float *w0,uchar isdet, __local float ppath[],float *energyloss,float *energylaunched,
__global float n_det[],__global uint *dpnum, __constant float4 gproperty[],
__constant float4 gdetpos[],__constant MCXParam gcfg[],int threadid, int threadphoton, int oddphotons, __local int *photons_per_blk){
__constant float4 gdetpos[],__constant MCXParam gcfg[],int threadid, int threadphoton, int oddphotons){

if(p[0].w>=0.f){
*energyloss+=p[0].w; // sum all the remaining energy
Expand All @@ -302,10 +302,6 @@ int launchnewphoton(float4 p[],float4 v[],float4 f[],float4 prop[],uint *idx1d,
#endif
}

if(photons_per_blk[0] <= 0)
return 1;
atomic_sub(photons_per_blk, 1);

if(f[0].w>=(threadphoton+(threadid<oddphotons)))
return 1; // all photons complete
p[0]=gcfg->ps;
Expand All @@ -328,8 +324,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
__constant float4 gdetpos[], __global uint stopsign[1],__global uint detectedphoton[1],
__local float *sharedmem, __constant MCXParam gcfg[]){

int idx=get_global_id(0);
int lid=get_local_id(0);
int idx= get_global_id(0);

float4 p={0.f,0.f,0.f,-1.f}; //{x,y,z}: x,y,z coordinates,{w}:packet weight
float4 v=gcfg->c0; //{x,y,z}: ix,iy,iz unitary direction vector, {w}:total scat event
Expand All @@ -355,18 +350,14 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const

__local float *ppath=sharedmem+get_local_id(0)*gcfg->maxmedia;

__local int photons_per_blk[1];
if(lid == 0) photons_per_blk[0]=nphoton;
barrier(CLK_LOCAL_MEM_FENCE);

#ifdef MCX_SAVE_DETECTORS
if(gcfg->savedet) clearpath(ppath,gcfg);
#endif

gpu_rng_init(t,n_seed,idx);

if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,0,ppath,
&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){
&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){
n_seed[idx]=NO_LAUNCH;
return;
}
Expand Down Expand Up @@ -473,7 +464,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
if((mediaid==0 && (!gcfg->doreflect || (gcfg->doreflect && n1==gproperty[mediaid].w))) || f.y>gcfg->twin1){
GPUDEBUG(((__constant char*)"direct relaunch at idx=[%d] mediaid=[%d], ref=[%d]\n",idx1d,mediaid,gcfg->doreflect));
if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,(mediaidold & DET_MASK),ppath,
&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){
&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){
break;
}
continue;
Expand Down Expand Up @@ -505,7 +496,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
if(mediaid==0){ // transmission to external boundary
GPUDEBUG(((__constant char*)"transmit to air, relaunch\n"));
if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,(mediaidold & DET_MASK),
ppath,&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){
ppath,&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){
break;
}
continue;
Expand Down
8 changes: 2 additions & 6 deletions src/mcx_host.cpp
Expand Up @@ -423,21 +423,17 @@ void mcx_run_simulation(Config *cfg,float *fluence,float *totalenergy){

mcxkernel=(cl_kernel*)malloc(workdev*sizeof(cl_kernel));

int photons_per_blk;
for(i=0;i<workdev;i++){
cl_int threadphoton, oddphotons;

int blocks = ((gpu[i].autothread + gpu[i].autoblock - 1 ) / gpu[i].autoblock);
photons_per_blk = ((cfg->nphoton + blocks - 1 ) / blocks);

threadphoton=(int)(cfg->nphoton*cfg->workload[i]/(fullload*gpu[i].autothread*cfg->respin));
oddphotons=(int)(cfg->nphoton*cfg->workload[i]/(fullload*cfg->respin)-threadphoton*gpu[i].autothread);
fprintf(cfg->flog,"- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d repetition=%d\n",i, gpu[i].id, gpu[i].name,threadphoton,oddphotons,
cfg->nphoton*cfg->workload[i]/fullload,(int)gpu[i].autothread,cfg->respin);

OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mcx_main_loop", &status),status)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_int), (void*)&photons_per_blk)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint),(void*)&oddphotons)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint),(void*)&threadphoton)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint),(void*)&oddphotons)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)&gmedia)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sizeof(cl_mem), (void*)(gfield+i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(genergy+i))));
Expand Down

0 comments on commit 0e2906c

Please sign in to comment.