multiple GPU support! set fermi as default, set atomic as default, better mex compilation

git-svn-id: https://svn.code.sf.net/p/mcx/svn/mcextreme_cuda/trunk@338 373d029a-b463-0410-adc7-be649bff2a08
fangq · Jun 25, 2015 · 60238c4 · 60238c4
1 parent 6815577
commit 60238c4
Show file tree

Hide file tree

Showing 9 changed files with 472 additions and 230 deletions.
diff --git a/mcxlab/examples/demo_mcxlab_basic.m b/mcxlab/examples/demo_mcxlab_basic.m
@@ -6,11 +6,13 @@
 % This file is part of Monte Carlo eXtreme (MCX) URL:http://mcx.sf.net
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
+clear cfg cfgs
 cfg.nphoton=1e7;
 cfg.vol=uint8(ones(60,60,60));
 cfg.srcpos=[30 30 1];
 cfg.srcdir=[0 0 1];
 cfg.gpuid=1;
+% cfg.gpuid='11'; % use two GPUs together
 cfg.autopilot=1;
 cfg.prop=[0 0 1 1;0.005 1 0 1.37];
 cfg.tstart=0;
@@ -23,7 +25,6 @@
 cfgs(2)=cfg;
 cfgs(1).isreflect=0;
 cfgs(2).isreflect=1;
-cfgs(2).issavedet=1;
 cfgs(2).detpos=[30 20 1 1;30 40 1 1;20 30 1 1;40 30 1 1];
 % calculate the flux and partial path lengths for the two configurations
 [fluxs,detps]=mcxlab(cfgs);

diff --git a/mcxlab/mcxlab.m b/mcxlab/mcxlab.m
@@ -3,7 +3,7 @@
 %====================================================================
 %      MCXLAB - Monte Carlo eXtreme (MCX) for MATLAB/GNU Octave
 %--------------------------------------------------------------------
-%Copyright (c) 2010-2014 Qianqian Fang <fangq at nmr.mgh.harvard.edu>
+%Copyright (c) 2010-2015 Qianqian Fang <fangq at nmr.mgh.harvard.edu>
 %                      URL: http://mcx.sf.net
 %====================================================================
 %
@@ -41,6 +41,15 @@
 %      cfg.detpos:     an N by 4 array, each row specifying a detector: [x,y,z,radius]
 %      cfg.respin:     repeat simulation for the given number of times (integer) [1]
 %      cfg.gpuid:      which GPU to use (run 'mcx -L' to list all GPUs) [1]
+%                      if set to an integer, gpuid specifies the index (starts at 1)
+%                      of the GPU for the simulation; if set to a binary string made
+%                      of 1s and 0s, it enables multiple GPUs. For example, '1101'
+%                      enables the 1st, 2nd and 4th GPUs to be used together.
+%      cfg.workload:   an array denoting the relative load of each selected GPU.
+%                      for example, [50,20,30] allocates 50%, 20% and 30% photons to the
+%                      3 selected GPUs, respectively; [10,10] evenly divides the load 
+%                      between 2 active GPUs. A simple load balancing strategy is to 
+%                      use the GPU core counts as the weight.
 %      cfg.isreflect:  [1]-consider refractive index mismatch, 0-matched index
 %      cfg.isrefint:   1-ref. index mismatch at inner boundaries, [0]-matched index
 %      cfg.isnormalized:[1]-normalize the output flux to unitary source, 0-no normalization
@@ -108,9 +117,6 @@
 %            a byte array (uint8) for each detected photon. The column number
 %            of seed equals that of detphoton.
 %
-%      if detphoton is ignored, the detected photon will be saved in a .mch file 
-%      if cfg.issavedeet=1; if no output is given, the flux will be saved to a 
-%      .mc2 file if cfg.issave2pt=1 (which is true by default).
 %
 % Example:
 %      cfg.nphoton=1e7;

diff --git a/src/Makefile b/src/Makefile
@@ -20,15 +20,15 @@ INCLUDEDIRS=-I/usr/local/cuda/include
 
 ifeq ($(BACKEND),ocelot)
   LINKOPT=-L/usr/local/lib `OcelotConfig -l`
-  CUCCOPT=-arch compute_20 #--maxrregcount 32
+  CUCCOPT=-g -arch compute_20 #--maxrregcount 32
 else
   LINKOPT=-L/usr/local/cuda/lib -lcudart -lm
-  CUCCOPT=#-arch compute_20 #--maxrregcount 32
+  CUCCOPT=-g #-arch compute_20 #--maxrregcount 32
 endif
 
 DLLFLAG=-fPIC
 
-CPPOPT=-g -Wall -O3 -std=c99
+CPPOPT=-g -Wall -O3 -std=c99 # -DUSE_OS_TIMER
 
 OBJSUFFIX=.o
 EXESUFFIX=
@@ -41,14 +41,14 @@ ifeq ($(findstring MINGW32,$(PLATFORM)), MINGW32)
   CC=nvcc
   LINKOPT=-L/c/CUDA/lib -lcudart --link
   INCLUDEDIRS +=-I/c/CUDA/include
-  CPPOPT =-c  -D_CRT_SECURE_NO_DEPRECATE -DWIN32
+  CPPOPT =-c -D_CRT_SECURE_NO_DEPRECATE -DWIN32
   OBJSUFFIX=.obj
   EXESUFFIX=.exe
   DLLFLAG=
 else
   ifeq ($(findstring x86_64,$(ARCH)), x86_64)
-     CPPOPT += -m64
-     CUCCOPT += -m64
+     CPPOPT +=-m64
+     CUCCOPT +=-m64
      ifeq "$(wildcard /usr/local/cuda/lib64)" "/usr/local/cuda/lib64"
         ifeq ($(BACKEND),cuda)
            LINKOPT=-L/usr/local/cuda/lib64 -lcudart -lm -lstdc++
@@ -57,6 +57,9 @@ else
   endif
 endif
 
+CUCCOPT+=-Xcompiler -fopenmp
+CPPOPT+=-fopenmp
+
 all logfast:CUCCOPT+=-use_fast_math
 mt:         CUCCOPT+=-DUSE_MT_RAND
 fast:       CUCCOPT+=-DUSE_MT_RAND -use_fast_math
@@ -67,7 +70,7 @@ racing:     CUCCOPT+=-DTEST_RACING
 mtatomic:   CUCCOPT+=-DUSE_MT_RAND -DUSE_ATOMIC -use_fast_math -arch compute_20 -DMCX_TARGET_NAME='"MT Atomic MCX"'
 logatomic:  CUCCOPT+=-DUSE_ATOMIC -use_fast_math -arch compute_20 -DMCX_TARGET_NAME='"LL5 Atomic MCX"'
 fermi fermimex fermioct:      CUCCOPT+=-DUSE_ATOMIC -use_fast_math
-fermimex:   CUCCOPT+=-DUSE_MT_RAND -use_fast_math
+#fermimex:   CUCCOPT+=-DUSE_MT_RAND -use_fast_math
 mtbox logbox:		CUCCOPT+=-DUSE_CACHEBOX -use_fast_math -arch compute_20 -DMCX_TARGET_NAME='"Cachebox MCX"'
 debugmt debuglog:	CUCCOPT+=-deviceemu
 mtatomic logatomic:	BINARY:=$(BINARY)_atomic
@@ -77,17 +80,20 @@ detbox mexbox octbox: CUCCOPT+=-DMCX_TARGET_NAME='"Cached Detective MCX"'
 fermi fermimex fermioct: CUCCOPT+=-DMCX_TARGET_NAME='"Fermi MCX"'
 det detbox: BINARY:=$(BINARY)_det
 logbox detbox:          BINARY:=$(BINARY)_cached
+all mt fast log logfast racing mtatomic logatomic mtbox logbox debugmt debuglog \
+ det detbox fermi:      LINKOPT+=-fopenmp
 
 mexbox mex fermimex:        AR=mex
-mexbox mex fermimex:        LINKOPT+= CXXFLAGS='$$CXXFLAGS -DSAVE_DETECTORS -DUSE_CACHEBOX'
-mexbox mex oct octbox fermimex:    OUTPUT_DIR=../mcxlab
-mex fermimex fermioct:      BINARY=mcxlab
+mexbox mex fermimex:        LINKOPT+= CXXFLAGS='$$CXXFLAGS -DSAVE_DETECTORS -DUSE_CACHEBOX -DMCX_CONTAINER -fopenmp' -lgomp LDFLAGS='$$LDFLAGS -fopenmp'
+mexbox mex oct octbox fermimex fermioct:    OUTPUT_DIR=../mcxlab
+mex fermimex:      BINARY=mcxlab
+oct fermioct:      BINARY=mcxlab.mex
 mexbox:                     BINARY=mcxlab_atom
-mexbox mex oct octbox fermimex fermioct:    CUCCOPT+=--compiler-options "$(DLLFLAG) -Wall" -DMCX_CONTAINER
+mexbox mex oct octbox fermimex fermioct:    CUCCOPT+=--compiler-options "$(DLLFLAG)" -DMCX_CONTAINER
 mexbox mex oct octbox fermimex fermioct:    CPPOPT+=$(DLLFLAG) -DMCX_CONTAINER
 mexbox mex fermimex:        LINKOPT+=mcxlab.cpp -cxx -outdir $(OUTPUT_DIR) $(INCLUDEDIRS)
 
-octbox oct fermioct:        AR= CXXFLAGS='-DSAVE_DETECTORS -DUSE_CACHEBOX' mkoctfile
+octbox oct fermioct:        AR= CXXFLAGS='-DSAVE_DETECTORS -DUSE_CACHEBOX -fopenmp' LDFLAGS='-fopenmp' mkoctfile
 oct:               BINARY=mcxlab.mex
 octbox:            BINARY=mcxlab_atom.mex
 octbox oct fermioct:        LINKOPT+=--mex mcxlab.cpp $(INCLUDEDIRS)

diff --git a/src/mcextreme.c b/src/mcextreme.c
@@ -18,23 +18,37 @@
 #include "tictoc.h"
 #include "mcx_utils.h"
 #include "mcx_core.h"
+#ifdef _OPENMP
+  #include <omp.h>
+#endif
 
 int main (int argc, char *argv[]) {
      Config  mcxconfig;
      GPUInfo *gpuinfo=NULL;
+     unsigned int activedev=0;
 
      mcx_initcfg(&mcxconfig);
 
      // parse command line options to initialize the configurations
      mcx_parsecmd(argc,argv,&mcxconfig);
 
      // identify gpu number and set one gpu active
-     if(!mcx_set_gpu(&mcxconfig,&gpuinfo)){
+     if(!(activedev=mcx_list_gpu(&mcxconfig,&gpuinfo))){
          mcx_error(-1,"No GPU device found\n",__FILE__,__LINE__);
      }
 
+#ifdef _OPENMP
+     omp_set_num_threads(activedev);
+     #pragma omp parallel
+     {
+#endif
+
      // this launches the MC simulation
-     mcx_run_simulation(&mcxconfig);
+     mcx_run_simulation(&mcxconfig,gpuinfo);
+
+#ifdef _OPENMP
+     }
+#endif
 
      // clean up the allocated memory in config and gpuinfo
      mcx_cleargpuinfo(&gpuinfo);