Merge pull request #17 from 3upperm2n/revert-16-fangqq

Revert "optimize photon workload at work-group level"
fangq · Jan 27, 2017 · 0e2906c
2 parents fb47e8d + aef375d
commit 0e2906c
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 22 deletions.
diff --git a/src/Makefile b/src/Makefile
@@ -12,8 +12,7 @@ OUTPUT_DIR=../bin
 INCLUDEDIRS=#-I/home/fangq/Download/ati-stream-sdk-v2.0-lnx32/include
 AMDAPPSDKROOT ?=/opt/AMDAPPSDK-2.9-1
 LIBOPENCLDIR ?=$(AMDAPPSDKROOT)/lib/x86_64
-CUDAOPENCLDIR ?=/usr/local/cuda/lib64
-LINKOPT=-g -L$(LIBOPENCLDIR) -L$(CUDAOPENCLDIR) -lOpenCL
+LINKOPT=-g -L$(LIBOPENCLDIR) -lOpenCL
 
 CUCCOPT=-I/usr/local/cuda/include #-m32 -msse2 -Wfloat-equal -Wpointer-arith  -DATI_OS_LINUX -g3 -ffor-scope 
 CCOPT=-g -pedantic -Wall -O3 -DMCX_OPENCL -DUSE_OS_TIMER -I/usr/local/cuda/include  -std=c99#-O3

diff --git a/src/mcx_core.cl b/src/mcx_core.cl
@@ -287,7 +287,7 @@ void rotatevector(float4 v[], float stheta, float ctheta, float sphi, float cphi
 int launchnewphoton(float4 p[],float4 v[],float4 f[],float4 prop[],uint *idx1d,
            uint *mediaid,float *w0,uchar isdet, __local float ppath[],float *energyloss,float *energylaunched,
 	   __global float n_det[],__global uint *dpnum, __constant float4 gproperty[],
-	   __constant float4 gdetpos[],__constant MCXParam gcfg[],int threadid, int threadphoton, int oddphotons, __local int *photons_per_blk){
+	   __constant float4 gdetpos[],__constant MCXParam gcfg[],int threadid, int threadphoton, int oddphotons){
 
       if(p[0].w>=0.f){
           *energyloss+=p[0].w;  // sum all the remaining energy
@@ -302,10 +302,6 @@ int launchnewphoton(float4 p[],float4 v[],float4 f[],float4 prop[],uint *idx1d,
 #endif
       }
 
-	  if(photons_per_blk[0] <= 0)
-		  return 1;
-	  atomic_sub(photons_per_blk, 1);
-
       if(f[0].w>=(threadphoton+(threadid<oddphotons)))
          return 1; // all photons complete 
       p[0]=gcfg->ps;
@@ -328,8 +324,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
      __constant float4 gdetpos[], __global uint stopsign[1],__global uint detectedphoton[1],
      __local float *sharedmem, __constant MCXParam gcfg[]){
 
-     int idx=get_global_id(0);
-	 int lid=get_local_id(0);
+     int idx= get_global_id(0);
 
      float4 p={0.f,0.f,0.f,-1.f};  //{x,y,z}: x,y,z coordinates,{w}:packet weight
      float4 v=gcfg->c0;  //{x,y,z}: ix,iy,iz unitary direction vector, {w}:total scat event
@@ -355,18 +350,14 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
 
      __local float *ppath=sharedmem+get_local_id(0)*gcfg->maxmedia;
 
-     __local int photons_per_blk[1]; 
-	 if(lid == 0) photons_per_blk[0]=nphoton;	
-	 barrier(CLK_LOCAL_MEM_FENCE);
-
 #ifdef  MCX_SAVE_DETECTORS
      if(gcfg->savedet) clearpath(ppath,gcfg);
 #endif
 
      gpu_rng_init(t,n_seed,idx);
 
      if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,0,ppath,
-		      &energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){
+		      &energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){
          n_seed[idx]=NO_LAUNCH;
          return;
      }
@@ -473,7 +464,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
           if((mediaid==0 && (!gcfg->doreflect || (gcfg->doreflect && n1==gproperty[mediaid].w))) || f.y>gcfg->twin1){
                   GPUDEBUG(((__constant char*)"direct relaunch at idx=[%d] mediaid=[%d], ref=[%d]\n",idx1d,mediaid,gcfg->doreflect));
 		  if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,(mediaidold & DET_MASK),ppath,
-		      &energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){ 
+		      &energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){ 
                          break;
 		  }
                   continue;
@@ -505,7 +496,7 @@ __kernel void mcx_main_loop(const int nphoton, const int ophoton,__global const
                         if(mediaid==0){ // transmission to external boundary
                             GPUDEBUG(((__constant char*)"transmit to air, relaunch\n"));
 		    	    if(launchnewphoton(&p,&v,&f,&prop,&idx1d,&mediaid,&w0,(mediaidold & DET_MASK),
-			        ppath,&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton, &photons_per_blk[0])){
+			        ppath,&energyloss,&energylaunched,n_det,detectedphoton,gproperty,gdetpos,gcfg,idx,nphoton,ophoton)){
                                     break;
 			    }
 			    continue;

diff --git a/src/mcx_host.cpp b/src/mcx_host.cpp
@@ -423,21 +423,17 @@ void mcx_run_simulation(Config *cfg,float *fluence,float *totalenergy){
 
      mcxkernel=(cl_kernel*)malloc(workdev*sizeof(cl_kernel));
 
-	 int photons_per_blk;
      for(i=0;i<workdev;i++){
          cl_int threadphoton, oddphotons;
 
-		 int blocks = ((gpu[i].autothread + gpu[i].autoblock - 1 ) / gpu[i].autoblock);
-		 photons_per_blk = ((cfg->nphoton + blocks - 1 ) /  blocks);
-
          threadphoton=(int)(cfg->nphoton*cfg->workload[i]/(fullload*gpu[i].autothread*cfg->respin));
          oddphotons=(int)(cfg->nphoton*cfg->workload[i]/(fullload*cfg->respin)-threadphoton*gpu[i].autothread);
          fprintf(cfg->flog,"- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d repetition=%d\n",i, gpu[i].id, gpu[i].name,threadphoton,oddphotons,
                cfg->nphoton*cfg->workload[i]/fullload,(int)gpu[i].autothread,cfg->respin);
 
 	 OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mcx_main_loop", &status),status)));
-	 OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_int), (void*)&photons_per_blk)));
-     OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint),(void*)&oddphotons)));
+	 OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint),(void*)&threadphoton)));
+         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint),(void*)&oddphotons)));
 	 OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)&gmedia)));
 	 OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sizeof(cl_mem), (void*)(gfield+i))));
 	 OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(genergy+i))));