Skip to content

Commit

Permalink
removed atomic operations and replaced with reduction sum
Browse files Browse the repository at this point in the history
  • Loading branch information
sigrimm committed May 19, 2020
1 parent 3f9e55a commit c2968e9
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 39 deletions.
1 change: 1 addition & 0 deletions check/README
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ TestH014 Hit16 01 3 bins storeSK 2
TestH015 Hit16 01 3 bins storeSK 1 Resampling 1
TestH016 gfnew2600
TestH017 BT2
TestH018 BT2 cutMode 1

TestNIST001 nist_ELevels.py nist_partition.py nist_Lines.py nist_Lines2.py
TestKurucz001 Kurucu2.py
Expand Down
32 changes: 32 additions & 0 deletions check/TestH018/param.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name = i
T = 1100.0
P = 1000
PFile = -
Species Name = 1H2-16O__BT2
SpeciesFile = -
ciaSystem = -
pathToData = ../../data/
numin = 0.0
numax = 30000.0
dnu = 1.0
Nnu per bin = 0
cutMode = 1
cut = 100.0
doResampling = 0
nC = 20
doTransmission = 0
nTr = 1000
dTr = 0.05
doStoreFullK = 1
pathToK =
doStoreSK = 0
nbins = 5
binsFile = -
OutputEdgesFile = -
kmin = 1e-15
qalphaL = 1.0
gammaF = 1.0
doMean = 0
Units = 0
ReplaceFiles = 1
profile = 1
2 changes: 1 addition & 1 deletion check/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
'''
############################################################
#compare Heliok with older Version
for i in range(1,18):
for i in range(1,19):
#for i in range(10,16):
os.chdir('TestH%03d' % i)
print('TestH%03d' % i, file = f)
Expand Down
14 changes: 7 additions & 7 deletions define.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#endif


#define VERSION 2.0
#define VERSION 2.01


#define def_T0 296.0 //Reference Temperature in K
Expand All @@ -44,7 +44,7 @@
#define def_TOLF 2.48e-12f //Tolerance in the Voigt function
#define def_nthmax 1048576 //Maximum number of threads for line and sort kernels
#define def_nlmax 32768 //Maximum number of lines per kernel launch, to prevent from time out on Desktop machines
#define def_maxlines 1000000ll //maximum number of lines stored on the GPU, Should not be less than maximum in HITEMP lines, must be long long int type
#define def_maxlines 1048576ll //maximum number of lines stored on the GPU, Should not be less than maximum in HITEMP lines, must be long long int type
#define def_maxfiles 500 //maximum number of files per molecule
#define def_doTuning 1 //use the self-tuning routines

Expand Down Expand Up @@ -194,10 +194,10 @@ struct Line{
long long int *iiLimitsC0_h, *iiLimitsC0_d; //limits for Line blocks, min
long long int *iiLimitsC1_h, *iiLimitsC1_d; //limits for Line blocks, max

unsigned long long int *iiLimitsAT_m, *iiLimitsAT_d; //limits for all blocks, mapped memory
unsigned long long int *iiLimitsALT_m, *iiLimitsALT_d; //limits for all blocks, mapped memory
unsigned long long int *iiLimitsART_m, *iiLimitsART_d; //limits for all blocks, mapped memory
unsigned long long int *iiLimitsBT_m, *iiLimitsBT_d; //limits for all blocks, mapped memory
unsigned long long int *iiLimitsCT_m, *iiLimitsCT_d; //limits for all blocks, mapped memory
long long int *iiLimitsAT_m, *iiLimitsAT_d; //limits for all blocks, mapped memory
long long int *iiLimitsALT_m, *iiLimitsALT_d; //limits for all blocks, mapped memory
long long int *iiLimitsART_m, *iiLimitsART_d; //limits for all blocks, mapped memory
long long int *iiLimitsBT_m, *iiLimitsBT_d; //limits for all blocks, mapped memory
long long int *iiLimitsCT_m, *iiLimitsCT_d; //limits for all blocks, mapped memory

};
48 changes: 23 additions & 25 deletions heliosk.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1117,41 +1117,39 @@ printf("%g %g %g %g\n", param.numax, param.numin, param.dnu, (param.numax - para
//Compute the opacity function K(x)
//************************************

int nlLimitsA = (NL + def_nlA - 1)/ def_nlA;
int nlLimitsB = (NL + def_nlB - 1)/ def_nlB;
int nlLimitsC = (NL + def_nlC - 1)/ def_nlC;

L.iiLimitsAT_m[0] = Nx;
L.iiLimitsAT_m[1] = 0ull;
L.iiLimitsALT_m[0] = Nx;
L.iiLimitsALT_m[1] = 0ull;
L.iiLimitsART_m[0] = Nx;
L.iiLimitsART_m[1] = 0ull;
L.iiLimitsBT_m[0] = Nx;
L.iiLimitsBT_m[1] = 0ull;
L.iiLimitsCT_m[0] = Nx;
L.iiLimitsCT_m[1] = 0ull;
int nlLimitsA = (NL + def_nlA - 1) / def_nlA;
int nlLimitsB = (NL + def_nlB - 1) / def_nlB;
int nlLimitsC = (NL + def_nlC - 1) / def_nlC;


//A
nuLimits_kernel<<< nlLimitsA, min(def_nlA, 1024), 0, nuLimitsStream[0] >>> (L.nu_d, L.ialphaD_d, L.vy_d, L.vcut2_d, L.nuLimitsA0_d, L.nuLimitsA1_d, param.numin, param.numax, def_nlA, NL, param.profile, 10);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[0] >>> (L.nuLimitsA0_d, L.nuLimitsA1_d, L.iiLimitsA0_d, L.iiLimitsA1_d, L.iiLimitsAT_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 10);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[0] >>> (L.nuLimitsA0_d, L.nuLimitsA1_d, L.iiLimitsA0_d, L.iiLimitsA1_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 10);

iiLimitsMax_kernel< 512 > <<< 1, 512 >>> (L.iiLimitsA0_d, L.iiLimitsA1_d, L.iiLimitsAT_d, Nx, nlLimitsA);

if(param.profile == 1){ //only for voigt profiles
//AL
nuLimits_kernel<<< nlLimitsA, min(def_nlA, 1024), 0, nuLimitsStream[1] >>> (L.nu_d, L.ialphaD_d, L.vy_d, L.vcut2_d, L.nuLimitsAL0_d, L.nuLimitsAL1_d, param.numin, param.numax, def_nlA, NL, param.profile, 11);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[1] >>> (L.nuLimitsAL0_d, L.nuLimitsAL1_d, L.iiLimitsAL0_d, L.iiLimitsAL1_d, L.iiLimitsALT_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 11);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[1] >>> (L.nuLimitsAL0_d, L.nuLimitsAL1_d, L.iiLimitsAL0_d, L.iiLimitsAL1_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 11);

iiLimitsMax_kernel< 512 > <<< 1, 512 >>> (L.iiLimitsAL0_d, L.iiLimitsAL1_d, L.iiLimitsALT_d, Nx, nlLimitsA);

//AR
nuLimits_kernel<<< nlLimitsA, min(def_nlA, 1024), 0, nuLimitsStream[2] >>> (L.nu_d, L.ialphaD_d, L.vy_d, L.vcut2_d, L.nuLimitsAR0_d, L.nuLimitsAR1_d, param.numin, param.numax, def_nlA, NL, param.profile, 12);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[2] >>> (L.nuLimitsAR0_d, L.nuLimitsAR1_d, L.iiLimitsAR0_d, L.iiLimitsAR1_d, L.iiLimitsART_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 12);
iiLimits_kernel <<< (nlLimitsA + 127) / 128, 128, 0, nuLimitsStream[2] >>> (L.nuLimitsAR0_d, L.nuLimitsAR1_d, L.iiLimitsAR0_d, L.iiLimitsAR1_d, binBoundaries_d, nlLimitsA, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 12);

iiLimitsMax_kernel< 512 > <<< 1, 512 >>> (L.iiLimitsAR0_d, L.iiLimitsAR1_d, L.iiLimitsART_d, Nx, nlLimitsA);
//B
nuLimits_kernel<<< nlLimitsB, min(def_nlB, 1024), 0, nuLimitsStream[3] >>> (L.nu_d, L.ialphaD_d, L.vy_d, L.vcut2_d, L.nuLimitsB0_d, L.nuLimitsB1_d, param.numin, param.numax, def_nlB, NL, param.profile, 20);
iiLimits_kernel <<< (nlLimitsB + 127) / 128, 128, 0, nuLimitsStream[3] >>> (L.nuLimitsB0_d, L.nuLimitsB1_d, L.iiLimitsB0_d, L.iiLimitsB1_d, L.iiLimitsBT_d, binBoundaries_d, nlLimitsB, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 20);
iiLimits_kernel <<< (nlLimitsB + 127) / 128, 128, 0, nuLimitsStream[3] >>> (L.nuLimitsB0_d, L.nuLimitsB1_d, L.iiLimitsB0_d, L.iiLimitsB1_d, binBoundaries_d, nlLimitsB, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 20);

iiLimitsMax_kernel< 512 > <<< 1, 512 >>> (L.iiLimitsB0_d, L.iiLimitsB1_d, L.iiLimitsBT_d, Nx, nlLimitsB);
//C
nuLimits_kernel<<< nlLimitsC, min(def_nlC, 1024), 0, nuLimitsStream[4] >>> (L.nu_d, L.ialphaD_d, L.vy_d, L.vcut2_d, L.nuLimitsC0_d, L.nuLimitsC1_d, param.numin, param.numax, def_nlC, NL, param.profile, 30);
iiLimits_kernel <<< (nlLimitsC + 127) / 128, 128, 0, nuLimitsStream[4] >>> (L.nuLimitsC0_d, L.nuLimitsC1_d, L.iiLimitsC0_d, L.iiLimitsC1_d, L.iiLimitsCT_d, binBoundaries_d, nlLimitsC, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 30);
iiLimits_kernel <<< (nlLimitsC + 127) / 128, 128, 0, nuLimitsStream[4] >>> (L.nuLimitsC0_d, L.nuLimitsC1_d, L.iiLimitsC0_d, L.iiLimitsC1_d, binBoundaries_d, nlLimitsC, param.numin, param.dnu, Nx, param.useIndividualX, param.nbins, param.Nxb, 30);

iiLimitsMax_kernel< 512 > <<< 1, 512 >>> (L.iiLimitsC0_d, L.iiLimitsC1_d, L.iiLimitsCT_d, Nx, nlLimitsC);
}


Expand All @@ -1162,11 +1160,11 @@ printf("%g %g %g %g\n", param.numax, param.numin, param.dnu, (param.numax - para
cudaEventSynchronize(iiLimitsEvent);


unsigned long long int nTA = L.iiLimitsAT_m[1] - L.iiLimitsAT_m[0];
unsigned long long int nTAL = L.iiLimitsALT_m[1] - L.iiLimitsALT_m[0];
unsigned long long int nTAR = L.iiLimitsART_m[1] - L.iiLimitsART_m[0];
unsigned long long int nTB = L.iiLimitsBT_m[1] - L.iiLimitsBT_m[0];
unsigned long long int nTC = L.iiLimitsCT_m[1] - L.iiLimitsCT_m[0];
long long int nTA = L.iiLimitsAT_m[1] - L.iiLimitsAT_m[0];
long long int nTAL = L.iiLimitsALT_m[1] - L.iiLimitsALT_m[0];
long long int nTAR = L.iiLimitsART_m[1] - L.iiLimitsART_m[0];
long long int nTB = L.iiLimitsBT_m[1] - L.iiLimitsBT_m[0];
long long int nTC = L.iiLimitsCT_m[1] - L.iiLimitsCT_m[0];

if(ntA < 0) ntA = 0ll;
if(ntAL < 0) ntAL = 0ll;
Expand All @@ -1181,7 +1179,6 @@ printf("%g %g %g %g\n", param.numax, param.numin, param.dnu, (param.numax - para
printf("C Limits %lld %lld | %lld\n", L.iiLimitsCT_m[0], L.iiLimitsCT_m[1], nTC);



if(nTA > 0){
cudaMemcpyAsync(L.iiLimitsA0_h, L.iiLimitsA0_d, nlLimitsA * sizeof(long long int), cudaMemcpyDeviceToHost, nuLimitsStream[0]);
cudaMemcpyAsync(L.iiLimitsA1_h, L.iiLimitsA1_d, nlLimitsA * sizeof(long long int), cudaMemcpyDeviceToHost, nuLimitsStream[0]);
Expand All @@ -1206,6 +1203,7 @@ printf("%g %g %g %g\n", param.numax, param.numin, param.dnu, (param.numax - para
}



double timeOld = time[0];
long long lPartOld = lPart;
int fii = fi;
Expand Down
91 changes: 85 additions & 6 deletions voigt.h
Original file line number Diff line number Diff line change
Expand Up @@ -1658,7 +1658,7 @@ __global__ void iiLimitsCheck(long long int *iiLimitsA0_d, long long int *iiLim
}


__global__ void iiLimits_kernel(double *nuLimits0_d, double* nuLimits1_d, long long int *iiLimits0_d, long long int *iiLimits1_d, unsigned long long int *iiLimitsT_d, double *binBoundaries_d, const int NLimits, const double numin, const double dnu, const int Nx, const int useIndividualX, const int nbins, const int Nxb, const int EE){
__global__ void iiLimits_kernel(double *nuLimits0_d, double* nuLimits1_d, long long int *iiLimits0_d, long long int *iiLimits1_d, double *binBoundaries_d, const int NLimits, const double numin, const double dnu, const int Nx, const int useIndividualX, const int nbins, const int Nxb, const int EE){

int idx = threadIdx.x;
int id = blockIdx.x * blockDim.x + idx;
Expand Down Expand Up @@ -1703,18 +1703,97 @@ __global__ void iiLimits_kernel(double *nuLimits0_d, double* nuLimits1_d, long l
iiLimits0_d[id] = ii00;
iiLimits1_d[id] = ii11;

unsigned long long int ii00u = (unsigned long long int)(ii00);
unsigned long long int ii11u = (unsigned long long int)(ii11);
//if(id == 683) printf("iilimitsK %d %d %g %g %lld %lld\n", EE, id, nu00, nu11, ii00, ii11);
}

}

// ********************************************************
// This kernel finds the minimum and maxmun of the iiLimits
// and stores them in iiLimitsT
// it uses a parallel reduction sum with only 1 thread block

//Author: Simon Grimm
//Date: May 2020
// ********************************************************
template <int nb>
__global__ void iiLimitsMax_kernel(long long int *iiLimits0_d, long long int *iiLimits1_d, long long int *iiLimitsT_d, const int Nx, const int nl){


int idy = threadIdx.x;

__shared__ long long int ii00_s[nb];
__shared__ long long int ii11_s[nb];

ii11_s[idy] = 0LL;
ii00_s[idy] = (long long int)(Nx);

__syncthreads();

for(int k = 0; k < nl; k += nb){
if(idy + k < nl){
ii00_s[idy] = min(ii00_s[idy], iiLimits0_d[idy + k]);
ii11_s[idy] = max(ii11_s[idy], iiLimits1_d[idy + k]);
}
}
__syncthreads();

if(nb >= 512){
if(idy < 256){
ii00_s[idy] = min(ii00_s[idy], ii00_s[idy + 256]);
ii11_s[idy] = max(ii11_s[idy], ii11_s[idy + 256]);
}
}
__syncthreads();

if(nb >= 256){
if(idy < 128){
ii00_s[idy] = min(ii00_s[idy], ii00_s[idy + 128]);
ii11_s[idy] = max(ii11_s[idy], ii11_s[idy + 128]);
}
}
__syncthreads();

if(nb >= 128){
if(idy < 64){
ii00_s[idy] = min(ii00_s[idy], ii00_s[idy + 64]);
ii11_s[idy] = max(ii11_s[idy], ii11_s[idy + 64]);
}
}
__syncthreads();
if(idy < 32){
volatile long long int *ii00 = ii00_s;
volatile long long int *ii11 = ii11_s;

atomicMin(&iiLimitsT_d[0], ii00u);
atomicMax(&iiLimitsT_d[1], ii11u);
ii00[idy] = min(ii00[idy], ii00[idy + 32]);
ii11[idy] = max(ii11[idy], ii11[idy + 32]);

//if(id < 10) printf("iilimitsK %d %d %g %g %lld %lld | %lld %lld\n", EE, id, nu00, nu11, ii00, ii11, iiLimitsT_d[0], iiLimitsT_d[1]);
ii00[idy] = min(ii00[idy], ii00[idy + 16]);
ii11[idy] = max(ii11[idy], ii11[idy + 16]);

ii00[idy] = min(ii00[idy], ii00[idy + 8]);
ii11[idy] = max(ii11[idy], ii11[idy + 8]);

ii00[idy] = min(ii00[idy], ii00[idy + 4]);
ii11[idy] = max(ii11[idy], ii11[idy + 4]);

ii00[idy] = min(ii00[idy], ii00[idy + 2]);
ii11[idy] = max(ii11[idy], ii11[idy + 2]);

ii00[idy] = min(ii00[idy], ii00[idy + 1]);
ii11[idy] = max(ii11[idy], ii11[idy + 1]);
}
__syncthreads();

if(idy == 0){
iiLimitsT_d[0] = ii00_s[0];
iiLimitsT_d[1] = ii11_s[0];
}

}



//This kernel finds the minimal and maximal index in wavenumbers for each block of lines
//EE 10: A
//EE 11: AL
Expand Down

0 comments on commit c2968e9

Please sign in to comment.