From a26fb009e8c0ca6cb2c2fae3a36cb60bde6a6ed3 Mon Sep 17 00:00:00 2001 From: enjalot Date: Wed, 3 Aug 2011 15:13:51 -0400 Subject: [PATCH] added radix but seems to be slower than bitonic on mac with few particles --- rtpslib/opencl/bitonic_sort/Makefile | 51 -- .../bitonic_sort/obj/release/main.cpp.o | Bin 4960 -> 0 bytes .../obj/release/oclBitonicSort_launcher.cpp.o | Bin 7476 -> 0 bytes .../release/oclSortingNetworks_validate.cpp.o | Bin 2884 -> 0 bytes rtpslib/opencl/bitonic_sort/src/main.cpp | 190 ------ .../src/oclBitonicSort_launcher.cpp | 230 ------- .../src/oclSortingNetworks_common.h | 72 -- .../src/oclSortingNetworks_validate.cpp | 114 ---- rtpslib/system/SPH.cpp | 21 + rtpslib/system/SPH.h | 3 + rtpslib/system/common/Radix.cpp | 317 +++++++++ rtpslib/system/common/Radix.h | 77 +++ rtpslib/system/common/cl_src/RadixSort.cl | 630 ++++++++++++++++++ rtpslib/system/common/cl_src/Scan_b.cl | 190 ++++++ 14 files changed, 1238 insertions(+), 657 deletions(-) delete mode 100644 rtpslib/opencl/bitonic_sort/Makefile delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/src/main.cpp delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp create mode 100644 rtpslib/system/common/Radix.cpp create mode 100644 rtpslib/system/common/Radix.h create mode 100644 rtpslib/system/common/cl_src/RadixSort.cl create mode 100644 rtpslib/system/common/cl_src/Scan_b.cl diff --git a/rtpslib/opencl/bitonic_sort/Makefile b/rtpslib/opencl/bitonic_sort/Makefile deleted file mode 100644 index 8777dbf0..00000000 --- a/rtpslib/opencl/bitonic_sort/Makefile +++ 
/dev/null @@ -1,51 +0,0 @@ -################################################################################ -# -# Copyright 1993-2009 NVIDIA Corporation. All rights reserved. -# -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. 
-# -################################################################################ -# -# Build script for project -# -################################################################################ - -# Add source files here -EXECUTABLE := oclSortingNetworks -# C/C++ source files (compiled with gcc / c++) -SRCDIR := src/ -CCFILES := main.cpp oclSortingNetworks_validate.cpp oclBitonicSort_launcher.cpp - -################################################################################ -# Rules and targets - -# NOTE: Assuming P4 based install until OpenCL becomes public -# Adjust P4_Root, default is ${HOME}/perforce/ -# P4_ROOT=${HOME}/myperforce/ - -include ../../common/common_opencl.mk - - diff --git a/rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o deleted file mode 100644 index 729e9bfa4c9862b88450e87b8b50d35399830401..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4960 zcmb7Ie{2-T6&{ldHir0|wn?GQHEXb2Xt2Yj5NMS|K732p${8GNr$h>Dj`P@e$^9^U zdtj$QBEr{tbU1aDQh`t?O8zLSKvknq5hB2XDi)R7q*Q81k!VN?4WUU%)j&!luHV~z z2X_mk?Z~s=esA8qA2YMN@BGH)53f`UQFV*J1)Z>U3N9Kt+AQ=FG;otPR^tvlL37Yb z95w~@TS#Mr2@zUMTr@D7!&xhL!wL+{Vkxy(QC^gfJWo;{w?q`8% zR@&Rmb`-~6GtJGdG@rvQQN$h3b0{ri3xPzRqW7K?`3H$X7@AlZDME=D2mhzbi>8XR}! 
zt`4)qLbSU2?dtVw#8CM->{SS!@CsIu;B6dC$F6f}Di(zZsj08ga*dUXd^?TeQRCUq zgs2y}vxf+b!a;I!gQ$$rjc%~&sM06%+Rd3D$ZK#;~itQl|M%cUP|o3nS3IQ96quxNQ~4mrRo( ztjdV9A6L)9LSiH#mAxO>isZsR{t~#`^aDUXrX6!{mJeNO3xGz$?Y?+&h^$ImEF6Kuq$`h z*1dPny*fI5^`vqhrr|##GW8eNv>V>5FGFOgBkGhVtWkRZiQvVk~{7uhJ7Ya1?R==4CKdi4aUs}{lk0V?&|xxhE*#U$SAmbURdIo>cXbX&xcXRX-1sc0hI z5*rvm9B8##Lqmh6TUN%>=4ed`ZEjO`o))#WWFjqz&({)ZtuK)T$&xO7>Vxo^?!TEQ zZEeENq!TeMiv*|elWxC&v zYfU-ClyhZS3dW90ZEe(!#pgMuJ40PvdRQzAwMPI=Ss3by=WNGWZ)QWCAo1FTxnv`v)#$?-Y>ocne5z9|Iz#z!Bhl@YjHt8u$eee}R{9p?OlpurV?~^*M8X$=G@m}SHJJYs z;Mc)7v3v-KGBXeZ;x|-tEAT78nLwQF=8teH=K;?E$<8StLJYhGoCE$V;5UIg8Mg!P z1s?%YoEsSX7$0M#*CyFBfCxPhVoqNZxDLRlXu)_R5G{bc5?>Nx4I%3NI+A}uh{p*b zpMiU-{X3{0rL`#it~$c}Fb)H?*O|Y`{I|@{GXFjEE6k5Gug4-$`xEmp^FK51WPXbI zAoF*b?_z$I`ElmwnNzJ#?E>?6n190j4D-v(FEKANr_H72L++G+HS=lAXEUdmko=}9=;dz%Ew>u@bw-pJsgF1#rz%*Z}Rwiz{79%@Se1ig8K3Rp%RTySo^d?$75Sz;`gry##@BepY1<*X+VJdE@HagC zJr5TqedmRljayyPj%50m;S;``{+&P~a8gJnGcj;0n(h^*d&QSN`DiqmlUZS+2KlbE zJCjknFJq#FEZ;UAATL9YuLkCP<}V^^u&+!D9kN%2Kx|prYoQ z+R|KKpR|O@6+>xIPp_kiD`m8|q_G|~4$2!zDzf)sIRx*OvQpH2mW0b#GLw}apDSfj zMzgX*rdBv*2WhdaEmIzEnV9p!QaCkIm-{~+UPFCQm=eX1v!-ov8GEWN)f++Kv@99L z0-I*m?v<7`Ak4nLWG)*QX1_$;5=F+Q**`dF4oEATNh39T$Xvg`-Ln=M7m>CF=t1=R u(7%Pg5PcE){pgF)m!N+eJ?JQsW{^~aq#GpVAZZ6lJ-ArthWo#yp!g3nAVDbr diff --git a/rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o deleted file mode 100644 index e41805406994004394bc6bd5462cca1b53078625..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7476 zcmbVRe{dA_72imJBtR~$psBjDNN7<>;P@eEwRn<+jom;DL^DcQlFO1@x!fIlcN%FK ziOEs6$0<~-X&pO^qa#c^CH|775vyYdD7K`ZxBE7Cw+Vvo zWcRb5_ult?-}k-me($@R*Up_fKT#0!#tFFKBz*faTqK-0zJary_yvvv4=bo4j-$mk z!l&SzB+;~(-X61cHeVyxugP%d?%$L64*v3Sd-Q99u%ATv!d3$3&@_KEW&}blEYP%E ze`ncGF~@!uadC)N3-^#TttlFHoC7tj(znK!5jf+Tml5Uoquga5h%X$9W(Mr`SHO8H z;6?=)C&gEDrCo~D}8maxBi_?rS5($UH)3phR_?vI5H zNZkIM@j7j&4qtR=TC;mx7W^ 
zSgt*l50R(EE9BJD|t)JalKD;edmz%%CutCiet)p!&HItN&3uH2ri2)>P%uLuhNFjVPQA0ss zsJN}S;<-p>k|}CYWu`Bfn^tt&v87}fwe@U!^ZfAefILJi?SMQ;6R(#p=&=+%F0nfw z!tL$$?I3Oy+;*I!>G*(nq6dJvUkPW^w<6WTZPq;9)(yFn;AD!@otvS;(ig!P#tQ$3lYvD$iCW2_nGGY@6j zQ-*9k{nlCfNe$a4HEeZH`)LS1XS!d~M3=os)TCAT3zqXd5}lL>@c_w#H1&UgiYDcu zy!6d-iRa-n{~aDS55DnX>|bWxpeYZXSO#?_{LyvUkx_n?x^#`z{43NYllQUflBwI5 zsLPH?4PEZK7&K4ET8=KMKC1n=U8F``K_uNAux5vvq5%mnzJ|-mD?HN$^Q|2D0mv2e zp!s(tWi9RAw53*3)2NyE^vg2|Y#}#wd4357Ctpe&JMigI{B%?vb#?F%sh@JS`6&sv zyDWD@cp`!cenDS5$YLcdTrHm_`AsCx9rSY~zjn0!5t2`U9QmN%W5sNETQ&9Ls@e95 zYR1nhogYa&v8-(POy?1{D1O!K)q<7I^UKtETrHaQTNIzBB(Et`lg(uXYBE?>q9$Wy zrAPW*V`)NcU$P)M>tXKxqVprA$~<3Xo>9y{xhkAozyd2y#T-ymZE4l~P)S)!K1=bV zcTB-^{5@Wm7nY&7MoJ>2}$e{gWDoOuGVZ7DI?80 z4=GtYqYFOebKvkxJ^`+0h~Ak_zT{dA7LxEem*o-?{@5kF1;P`XM=u+^-^O@3I;+t@i|f;By9);L84?^R3ymlff(LMD=I3A1YCmQ#%53{tB(qQut><| z!rZx1^@EZ>+}5UtVnq}i9cb@2^hWY0HQ7N*QA8<4jBpEH106%yHN^u#tD}q9jqPuv zZiMt;v`FCAlnOQMZw!tScpXW#ZnWqtg5jo&ktMR5W#Vtu{TnI%wm>M*7H^ZH@kj*T z^=9dTu(7el2*)E*G_YBxu@uR*wX15aq*t>!8I@QK>N! z2?qQ~Q#d4nhQhI|ULdoKn)Ukwe!T(}DzB8(Rm*&;wz}$GSyQX3tJd7M$UEN-*}7iC z_j#LZYK6S^k_k^@Z!DBdo>;Jxo-Bc;ElJK|9?_AajUOHPq%JPmc3D2{ccx6pr&pKG zi905w#y^t(=s0(~erT;@RJ*AVKQbFZ7Vri^ zDE|ZV0p`y$-_QK#%%5OxGJlwPjCmV#jd?xuyMVLN&z-;u;H^Nc<>De>IsPhu$dD)k zafnv{DX!Bf{`J69KnzX%3y}N_0m)Avko>#=B)eY$@hPBuACUY$4MfI716T&W7KoCF zE153=;)h7Qf%#M*>L*SB&V>D`Lfg;#z{?^3GmzT93Y4H41b!X74_E;FB@ks1pJn|X zAbuUhpRm3QNd0YLeLD~{K@72eEs*-JV|@+K1HPQ~OMx`5B|v=A5zBzn!Kbml7>K=~ zI1xy80&oiW#|Vz%`v8|2kPiV#e;7C!@_oP?!Jh#l_2OedtnB5F0Fer@3rO}`fN&|U z0?r0kfK-QDfK-PCK(dRl!tvl%EQw@%EOaD%EOO<({uVg-j z`58Q8ZYGvCB~9rF*6REp=S^?PXMj)G9O}|X1*DS zp@;zC3Ou^qK#+BW_)RI^4aBF+wSBE9(Hju? 
zyPCPmyqx)6%x`CYH}iVt_c7njd_8d#-1$BqXZ^>l-%K16%=zAqbO;{0@KJ+%5A^dm zKkFbT{mzND!FtxWvi>pFug79ZN1XM0SbqyR{zi^>ahJ_MbMZkJr|;s~_Tyasu5$5+ z%f7?KDXeUN^)9~9fEjEgUK@s%zvXj%JILGuS!>#^@zTUejbLR=1QitBNG zWo2z+s71F`f`+ZWwU=LQ|IbK^wi;>}_BP7xf~I5hPVK`{{f^8_0ZD50pxziAaWx{S zv3KHvhW%8#KZsyhRg&^1Dmsmtc!^x~Wk*Y~R4oBf+A^$m^jU{+|hW{H1eR(kBW z1a2});kBB~pEuGZ`(?w`DC?!e(k$>tvid<}3ve#Pxd`V?I2Yqwg7apaUMqZxANzk? f$kw~i4tOEn#v&-*stCPUL`kzqkQMv?Z(ia5$}D|c diff --git a/rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o deleted file mode 100644 index 498398867a963001c0a54a999224b6b7b66992de..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2884 zcma)8U2GIp6u#RIwARw8@)IO1okoZaVr3PiV`E^#uFS$#&~}kTV6E-6-OzUH&P>zd zgT(EYj6;$5qKQ7#q-r00AO@2@^kECNKcqpdL7PRgJt$%~o zIjd^WZ=F(mUZHt>sVM?GJeZsmyYkVQbzU91lA~IIR50uqRo|UTt@&)}p#a`yO&MK8ly3AOW^>>vu<#VV$XH>pyt_V7# zq1={~=aLWew$GNf)JJ_NxK~e~GWEbxWnN#E4M2LhSw+ zCM%^lLbsixIQ}GahWzrc=9%P0!>8B_mC67UlOG|-H;a_Z#gaq#j`goODk!$UM^K$1 z`K0--Ga_FyrxkmXUDcZ$7rW@CNBmMAkEqJ<2{G{@%+^TH>mF>XEgY6d+OOK?@caey zrDhi2!-)N?n6sY`kC>m;-Q%nrPGHy$^vQu?qZrFOpnAgnLC-JKi959abAKT2;OT>S zFMQ*7i@z=&-b}BG+nfHaH8S&Cd3kwzZF97~*1#Dm>9M$>)l0CLnl9DGi8e=#PN^*(Yt!Ne(yY-7 z3fMiTJD_#yn8p=%&XCsHXf#PiEGFqqC~2LfbwsJa0;;82Go&@9gi+}VsZdR7jTusN zv?WUQrzdUhbSq8goj#0w z^mNQ~aMv5m(^0(eGrBN}3-j`i(QlTmgOcSY_JL#-8A$|k7ZrEs2z0zIqO+D{19Wcu zb~g(dv*HE1eG+*`7WOPSDtwfiJ+3Ts(LH3`0nEOHvFGZ6=*U~n`7=P&3Piu&jXLznh>YzgEW+mq5O>#JLf8xU0H1)Luo(Ut?k9x3undTm+fRslu4py) z6T)6d_Z_tAb6^n)%KV?`<$Msulk`E!f0pxp&M$NR4d+vwALl#|&61pe()cpY2RK(b zALRTH=PAyQb3V-Z70&5!Ey7%x|0`HKCenk;J|OL7J~n~mSt$A2cubNw{B7J%Zx{I` zAW1Wn{1x244V(`pH`|=WpU&be(hzNK-t8VVEW(c6 UZh}@oWvFjG3#jS8i$2Ew2W2aa5dZ)H diff --git a/rtpslib/opencl/bitonic_sort/src/main.cpp b/rtpslib/opencl/bitonic_sort/src/main.cpp deleted file mode 100644 index 50175c05..00000000 --- a/rtpslib/opencl/bitonic_sort/src/main.cpp +++ /dev/null @@ -1,190 +0,0 @@ 
-/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -//Standard utilities and systems includes -#include -#include "oclSortingNetworks_common.h" - -//////////////////////////////////////////////////////////////////////////////// -//Test driver -//////////////////////////////////////////////////////////////////////////////// -int main(int argc, const char **argv) -{ - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_context cxGPUContext; - cl_command_queue cqCommandQueue; - cl_mem d_InputKey, d_InputVal, d_OutputKey, d_OutputVal; - - cl_int ciErrNum; - uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; - - const uint dir = 1; - const uint N = 1048576; - const uint numValues = 65536; - - // set logfile name and start logs - shrSetLogFileName ("oclSortingNetworks.txt"); - shrLog("%s Starting...\n\n", argv[0]); - - shrLog("Initializing data...\n"); - h_InputKey = (uint *)malloc(N * sizeof(uint)); - h_InputVal = (uint *)malloc(N * sizeof(uint)); - h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); - h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); - srand(2009); - for (uint i = 0; i < N; i++) - h_InputKey[i] = rand() % numValues; - fillValues(h_InputVal, N); - - shrLog("Initializing OpenCL...\n"); - //Get the NVIDIA platform - ciErrNum = oclGetPlatformID(&cpPlatform); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Get the devices - ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, 
&cdDevice, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Create the context - cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Create a command-queue - cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - shrLog("Initializing OpenCL bitonic sorter...\n"); - initBitonicSort(cxGPUContext, cqCommandQueue, argv); - - shrLog("Creating OpenCL memory objects...\n\n"); - d_InputKey = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N * sizeof(cl_uint), h_InputKey, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_InputVal = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N * sizeof(cl_uint), h_InputVal, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_OutputKey = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(cl_uint), NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_OutputVal = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(cl_uint), NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Temp storage for key array validation routine - uint *srcHist = (uint *)malloc(numValues * sizeof(uint)); - uint *resHist = (uint *)malloc(numValues * sizeof(uint)); - -#ifdef GPU_PROFILING - cl_event startTime, endTime; -#endif - - int globalFlag = 1;// init pass/fail flag to pass - for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) - { - shrLog("Test array length %u (%u arrays in the batch)...\n", arrayLength, N / arrayLength); - -#ifdef GPU_PROFILING - clFinish(cqCommandQueue); - ciErrNum = clEnqueueMarker(cqCommandQueue, &startTime); - oclCheckError(ciErrNum, CL_SUCCESS); - shrDeltaT(0); -#endif - - size_t szWorkgroup = bitonicSort( - NULL, - d_OutputKey, - d_OutputVal, - d_InputKey, - d_InputVal, - N / arrayLength, - arrayLength, - dir - ); - oclCheckError(szWorkgroup > 0, true); - -#ifdef GPU_PROFILING - if 
(arrayLength == N) - { - ciErrNum = clEnqueueMarker(cqCommandQueue, &endTime); - oclCheckError(ciErrNum, CL_SUCCESS); - clFinish(cqCommandQueue); - double timerValue = shrDeltaT(0); - shrLogEx(LOGBOTH | MASTER, 0, "oclSortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); - - cl_ulong startTimeVal = 0, endTimeVal = 0; - ciErrNum = clGetEventProfilingInfo( - startTime, - CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), - &startTimeVal, - NULL - ); - - ciErrNum = clGetEventProfilingInfo( - endTime, - CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), - &endTimeVal, - NULL - ); - - shrLog("OpenCL time: %.5f s\n", 1.0e-9 * (double)(endTimeVal - startTimeVal)); - } -#endif - - //Reading back results from device to host - ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_OutputKey, CL_TRUE, 0, N * sizeof(cl_uint), h_OutputKeyGPU, 0, NULL, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_OutputVal, CL_TRUE, 0, N * sizeof(cl_uint), h_OutputValGPU, 0, NULL, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Check if keys array is not corrupted and properly ordered - int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, dir, srcHist, resHist); - - //Check if values array is not corrupted - int valuesFlag = validateSortedValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); - - // accumulate any error or failure - globalFlag = globalFlag && keysFlag && valuesFlag; - } - - // pass or fail (cumulative... all tests in the loop) - shrLog("%s\n\n", globalFlag ? 
"PASSED" : "FAILED"); - - // Start Cleanup - shrLog("Shutting down...\n"); - //Discard temp storage for key validation routine - free(srcHist); - free(resHist); - - //Release kernels and program - closeBitonicSort(); - - //Release other OpenCL Objects - ciErrNum = clReleaseMemObject(d_OutputVal); - ciErrNum |= clReleaseMemObject(d_OutputKey); - ciErrNum |= clReleaseMemObject(d_InputVal); - ciErrNum |= clReleaseMemObject(d_InputKey); - ciErrNum |= clReleaseCommandQueue(cqCommandQueue); - ciErrNum |= clReleaseContext(cxGPUContext); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Release host buffers - free(h_OutputValGPU); - free(h_OutputKeyGPU); - free(h_InputVal); - free(h_InputKey); - - //Finish - shrEXIT(argc, argv); -} diff --git a/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp b/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp deleted file mode 100644 index 5ce03f5d..00000000 --- a/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. 
- * - */ - -//#include -#include "CL/cl.hpp" -#include -#include -#include "oclSortingNetworks_common.h" - -extern "C" void closeBitonicSort(void); - -//////////////////////////////////////////////////////////////////////////////// -// OpenCL launcher for bitonic sort kernel -//////////////////////////////////////////////////////////////////////////////// -//OpenCL bitonic sort program -static cl_program cpBitonicSort; - -//OpenCL bitonic sort kernels -static cl_kernel - ckBitonicSortLocal, - ckBitonicSortLocal1, - ckBitonicMergeGlobal, - ckBitonicMergeLocal; - -//Default command queue for bitonic kernels -static cl_command_queue cqDefaultCommandQue; - -static const uint LOCAL_SIZE_LIMIT = 512U; -static const char *compileOptions = "-D LOCAL_SIZE_LIMIT=512"; - -extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){ - cl_int ciErrNum; - size_t kernelLength; - - //shrLog("...loading BitonicSort.cl\n"); - //char *cBitonicSort = oclLoadProgSource(shrFindFilePath("BitonicSort.cl", argv[0]), "// My comment\n", &kernelLength); - //oclCheckError(cBitonicSort != NULL, shrTRUE); - - std::string paths(BITONIC_CL_SOURCE_DIR); - paths = paths + "/BitonicSort.cl"; - const char* pathr = paths.c_str(); - FILE* fd =fopen(pathr, "r"); - //printf("fd= %d\n", fd); - char* cBitonicSort = new char [30000]; - int nb = fread(cBitonicSort, 1, 30000, fd); - kernelLength = nb; - //printf("pathr= %s\n", pathr); - //printf("cBitonicSort= %s\n", cBitonicSort); - - //shrLog("...creating bitonic sort program\n"); - cpBitonicSort = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBitonicSort, &kernelLength, &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog("...building bitonic sort program\n"); - ciErrNum = clBuildProgram(cpBitonicSort, 0, NULL, compileOptions, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog( "...creating bitonic sort kernels\n"); - ckBitonicSortLocal = 
clCreateKernel(cpBitonicSort, "bitonicSortLocal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicSortLocal1 = clCreateKernel(cpBitonicSort, "bitonicSortLocal1", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicMergeGlobal = clCreateKernel(cpBitonicSort, "bitonicMergeGlobal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicMergeLocal = clCreateKernel(cpBitonicSort, "bitonicMergeLocal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog( "...checking minimum supported workgroup size\n"); - //Check for work group size - cl_device_id device; - size_t szBitonicSortLocal, szBitonicSortLocal1, szBitonicMergeLocal; - - ciErrNum = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicSortLocal, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicSortLocal, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicSortLocal1, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicSortLocal1, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicMergeLocal, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicMergeLocal, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - if( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ){ - //shrLog("\nERROR !!! 
Minimum work-group size %u required by this application is not supported on this device.\n\n", LOCAL_SIZE_LIMIT / 2); - closeBitonicSort(); - free(cBitonicSort); - //shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); - exit(EXIT_FAILURE); - } - - //Save default command queue - cqDefaultCommandQue = cqParamCommandQue; - - //Discard temp storage - free(cBitonicSort); -} - -extern "C" void closeBitonicSort(void) -{ - cl_int ciErrNum; - ciErrNum = clReleaseKernel(ckBitonicMergeLocal); - ciErrNum |= clReleaseKernel(ckBitonicMergeGlobal); - ciErrNum |= clReleaseKernel(ckBitonicSortLocal1); - ciErrNum |= clReleaseKernel(ckBitonicSortLocal); - ciErrNum |= clReleaseProgram(cpBitonicSort); - //oclCheckError(ciErrNum, CL_SUCCESS); -} - -static cl_uint factorRadix2(cl_uint& log2L, cl_uint L){ - if(!L){ - log2L = 0; - return 0; - }else{ - for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++); - return L; - } -} - -extern "C" size_t bitonicSort( - cl_command_queue cqCommandQueue, - cl_mem d_DstKey, - cl_mem d_DstVal, - cl_mem d_SrcKey, - cl_mem d_SrcVal, - uint batch, - uint arrayLength, - uint dir -){ - if(arrayLength < 2) - return 0; - - //Only power-of-two array lengths are supported so far - cl_uint log2L; - cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength); - //oclCheckError( factorizationRemainder == 1, shrTRUE ); - - if(!cqCommandQueue) - cqCommandQueue = cqDefaultCommandQue; - - dir = (dir != 0); - - cl_int ciErrNum; - size_t localWorkSize; - size_t globalWorkSize; - - if(arrayLength <= LOCAL_SIZE_LIMIT) - { - //oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); - //Launch bitonicSortLocal - ciErrNum = clSetKernelArg(ckBitonicSortLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 2, sizeof(cl_mem), (void *)&d_SrcKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 3, sizeof(cl_mem), (void *)&d_SrcVal); - 
ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 5, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - } - else - { - //Launch bitonicSortLocal1 - ciErrNum = clSetKernelArg(ckBitonicSortLocal1, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 2, sizeof(cl_mem), (void *)&d_SrcKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 3, sizeof(cl_mem), (void *)&d_SrcVal); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) - { - for(unsigned stride = size / 2; stride > 0; stride >>= 1) - { - if(stride >= LOCAL_SIZE_LIMIT) - { - //Launch bitonicMergeGlobal - ciErrNum = clSetKernelArg(ckBitonicMergeGlobal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 2, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 3, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 5, sizeof(cl_uint), (void *)&size); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 6, sizeof(cl_uint), (void 
*)&stride); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 7, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeGlobal, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - } - else - { - //Launch bitonicMergeLocal - ciErrNum = clSetKernelArg(ckBitonicMergeLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 2, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 3, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 5, sizeof(cl_uint), (void *)&stride); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 6, sizeof(cl_uint), (void *)&size); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 7, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - break; - } - } - } - } - return localWorkSize; -} diff --git a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h b/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h deleted file mode 100644 index 4d6cd650..00000000 --- a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -//////////////////////////////////////////////////////////////////////////////// -// Shortcut typename -//////////////////////////////////////////////////////////////////////////////// -typedef unsigned int uint; - -//////////////////////////////////////////////////////////////////////////////// -// Host-side validation routines -//////////////////////////////////////////////////////////////////////////////// -extern "C" int validateSortedKeys( - uint *result, - uint *data, - uint batch, - uint N, - uint numValues, - uint dir, - uint *srcHist, - uint *resHist -); - -extern "C" void fillValues( - uint *val, - uint N -); - -extern "C" int validateSortedValues( - uint *resKey, - uint *resVal, - uint *srcKey, - uint batchSize, - uint arrayLength -); - -extern "C" int validateValues( - uint *oval, - uint *okey, - uint N -); - -//////////////////////////////////////////////////////////////////////////////// -// OpenCL bitonic sort -//////////////////////////////////////////////////////////////////////////////// -#if 0 -extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv); - -extern "C" void closeBitonicSort(void); - -extern"C" size_t bitonicSort( - cl_command_queue cqCommandQueue, - cl_mem d_DstKey, - cl_mem d_DstVal, - cl_mem d_SrcKey, - cl_mem d_SrcVal, - uint batch, - uint arrayLength, - uint dir -); -#endif diff --git a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp b/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp deleted file mode 100644 index 59330087..00000000 --- 
a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -#include -#include "oclSortingNetworks_common.h" - -//////////////////////////////////////////////////////////////////////////////// -// Validate sorted keys array (check for integrity and proper order) -// returns 1 if correct/pass, returns 0 if incorrect/fail -//////////////////////////////////////////////////////////////////////////////// -extern "C" int validateSortedKeys( - uint *resKey, - uint *srcKey, - uint batch, - uint N, - uint numValues, - uint dir, - uint *srcHist, - uint *resHist -){ - shrLog("...validating sorted keys: "); - - if(N < 2){ - shrLog("arrayLength too short, exiting\n"); - return 1; - } - - for(uint j = 0; j < batch; j++, srcKey += N, resKey += N){ - memset(srcHist, 0, numValues * sizeof(uint)); - memset(resHist, 0, numValues * sizeof(uint)); - - //Build histograms for current array - for(uint i = 0; i < N; i++) - if( (srcKey[i] < numValues) && (resKey[i] < numValues) ){ - srcHist[srcKey[i]]++; - resHist[resKey[i]]++; - }else{ - shrLog("***Set %u key arrays are not limited properly***\n", j); - return 0; - } - - //Compare the histograms - for(uint i = 0; i < numValues; i++) - if(srcHist[i] != resHist[i]){ - shrLog("***Set %u key histograms do not match***\n", j); - return 0; - } - - //Check the ordering - for(uint i = 0; i < N - 1; 
i++) - if( (dir && (resKey[i] > resKey[i + 1])) || (!dir && (resKey[i] < resKey[i + 1])) ){ - shrLog("***Set %u key array is not ordered properly***\n", j); - return 0; - } - } - - //All checks passed - shrLog("OK\n"); - return 1; -} - -//////////////////////////////////////////////////////////////////////////////// -// Generate input values from input keys -// Check output values to match output keys by the same translation function -// returns 1 if correct/pass, returns 0 if incorrect/fail -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// Value validation / stability check routines -//////////////////////////////////////////////////////////////////////////////// -extern "C" void fillValues( - uint *val, - uint N -){ - for(uint i = 0; i < N; i++) - val[i] = i; -} - -extern "C" int validateSortedValues( - uint *resKey, - uint *resVal, - uint *srcKey, - uint batchSize, - uint arrayLength -){ - int stableFlag = 1; - - shrLog( "...validating sorted values array: "); - for(uint j = 0; j < batchSize; j++, resKey += arrayLength, resVal += arrayLength){ - for(uint i = 0; i < arrayLength; i++){ - if( (resVal[i] < j * arrayLength) || (resVal[i] >= (j + 1) * arrayLength) || (resKey[i] != srcKey[resVal[i]]) ){ - shrLog("***corrupted!!!***\n"); - return 0; - } - - if( (i + 1 < arrayLength) && (resKey[i] == resKey[i + 1]) && (resVal[i] > resVal[i + 1]) ) - stableFlag = 0; - } - } - - shrLog("OK\n...stability property: %s\n\n", stableFlag ? 
"Stable" : "NOT stable !!!"); - return 1; -} diff --git a/rtpslib/system/SPH.cpp b/rtpslib/system/SPH.cpp index ff6a9e2f..c336c62c 100644 --- a/rtpslib/system/SPH.cpp +++ b/rtpslib/system/SPH.cpp @@ -94,6 +94,7 @@ namespace rtps hash = Hash(common_source_dir, ps->cli, timers["hash_gpu"]); bitonic = Bitonic(common_source_dir, ps->cli ); + radix = Radix(common_source_dir, ps->cli, max_num, 128); cellindices = CellIndices(common_source_dir, ps->cli, timers["ci_gpu"] ); permute = Permute( common_source_dir, ps->cli, timers["perm_gpu"] ); @@ -367,6 +368,7 @@ namespace rtps //defined in Sort.cpp timers["bitonic"]->start(); bitonic_sort(); + //radix_sort(); timers["bitonic"]->stop(); } @@ -865,6 +867,25 @@ namespace rtps //renderer->setParticleRadius(spacing*0.5); renderer->setParticleRadius(spacing); //renderer->setRTPS( + } + //---------------------------------------------------------------------- + void SPH::radix_sort() + { + try + { + int snum = nlpo2(num); + if(snum < 1024) + { + snum = 1024; + } + //printf("sorting snum: %d", snum); + radix.sort(snum, &cl_sort_hashes, &cl_sort_indices); + } + catch (cl::Error er) + { + printf("ERROR(radix sort): %s(%s)\n", er.what(), oclErrorString(er.err())); + } + } //---------------------------------------------------------------------- void SPH::bitonic_sort() diff --git a/rtpslib/system/SPH.h b/rtpslib/system/SPH.h index 75722b6a..789c3d2d 100644 --- a/rtpslib/system/SPH.h +++ b/rtpslib/system/SPH.h @@ -27,6 +27,7 @@ class OUTER; //#include #include #include +#include //#include #include #include // contains CloudPermute @@ -183,6 +184,7 @@ namespace rtps Buffer cl_sort_output_indices; Bitonic bitonic; + Radix radix; //Parameter structs Buffer cl_sphp; @@ -223,6 +225,7 @@ namespace rtps void hash_and_sort(); void cloud_hash_and_sort(); // GE void bitonic_sort(); + void radix_sort(); void cloud_bitonic_sort(); // GE Density density; Force force; diff --git a/rtpslib/system/common/Radix.cpp b/rtpslib/system/common/Radix.cpp new 
file mode 100644 index 00000000..b2ce20ee --- /dev/null +++ b/rtpslib/system/common/Radix.cpp @@ -0,0 +1,317 @@ + +/** + * C++ port of NVIDIA's Radix Sort implementation with Key-Value instead of Keys Only + */ + + +template +Radix::Radix(std::string source_dir, CL *cli, int max_elements, int cta_size ) +{ + this->cli = cli; + + WARP_SIZE = 128;//32 + SCAN_WG_SIZE = 128;//256 + MIN_LARGE_ARRAY_SIZE = 4 * SCAN_WG_SIZE; + bit_step = 4; + //maybe cta_size should be passed to the call instead of the constructor + this->cta_size = cta_size; + uintsz = sizeof(T); + + loadKernels(source_dir); + + int num_blocks; + if ((max_elements % (cta_size * 4)) == 0) + { + num_blocks = max_elements / (cta_size * 4); + } + else + { + num_blocks = max_elements / (cta_size * 4) + 1; + } + + vector tmp(max_elements); + d_tempKeys = Buffer(cli, tmp); + d_tempValues = Buffer(cli, tmp); + + tmp.resize(WARP_SIZE * num_blocks); + mCounters = Buffer(cli, tmp); + mCountersSum = Buffer(cli, tmp); + mBlockOffsets = Buffer(cli, tmp); + + int numscan = max_elements/2/cta_size*16; + if (numscan >= MIN_LARGE_ARRAY_SIZE) + { + //#MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE 1024 + tmp.resize(numscan / 1024); + scan_buffer = Buffer(cli, tmp); + } + + + +} + +template +void Radix::loadKernels(std::string source_dir) +{ + string radix_source = source_dir + "/RadixSort.cl"; + + std::string options = "-D LOCAL_SIZE_LIMIT=512"; + cl::Program prog = cli->loadProgram(radix_source, options); + //printf("radix sort\n"); + + //printf("load scanNaive\n"); + k_scanNaive = Kernel(cli, prog, "scanNaive"); + //printf("load radixSortBlockKeysValues\n"); + //k_radixSortBlockKeysValues = Kernel(cli, prog, "radixSortBlockKeysValues"); + //printf("load radixSortBlocksKeysValues\n"); + k_radixSortBlocksKeysValues = Kernel(cli, prog, "radixSortBlocksKeysValues"); + //printf("load reorderDataKeysValues\n"); + k_reorderDataKeysValues = Kernel(cli, prog, "reorderDataKeysValues"); + //printf("load findRadixOffsets\n"); + 
k_findRadixOffsets = Kernel(cli, prog, "findRadixOffsets"); + + string scan_source = source_dir + "/Scan_b.cl"; + + options = "-D LOCAL_SIZE_LIMIT=512"; + prog = cli->loadProgram(scan_source, options); + + k_scanExclusiveLocal1 = Kernel(cli, prog, "scanExclusiveLocal1"); + k_scanExclusiveLocal2 = Kernel(cli, prog, "scanExclusiveLocal2"); + k_uniformUpdate = Kernel(cli, prog, "uniformUpdate"); + + + + //TODO: implement this check with the C++ API + //if( (szRadixSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szRadixSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) || (szRadixMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ){ + //shrLog("\nERROR !!! Minimum work-group size %u required by this application is not supported on this device.\n\n", LOCAL_SIZE_LIMIT / 2); +} + +template +void Radix::sort(int num, Buffer* keys, Buffer* values) +{ + //printf("radix sort routine\n"); + this->keys = keys; + this->values = values; + int key_bits = sizeof(T) * 8; + //printf("sorting %d\n", num); + //printf("key_bits %d\n", key_bits); + //printf("bit_step %d\n", bit_step); + + int i = 0; + while(key_bits > i*bit_step) + { + //printf("i*bit_step %d\n", i*bit_step) + step(bit_step, i*bit_step, num); + i += 1; + } + cli->queue.finish(); +} + +template +void Radix::step(int nbits, int startbit, int num) +{ + blocks(nbits, startbit, num); + cli->queue.finish(); + + find_offsets(startbit, num); + cli->queue.finish(); + + int array_length = num/2/cta_size*16; + if(array_length < MIN_LARGE_ARRAY_SIZE) + { + naive_scan(num); + } + else + { + scan(&mCountersSum, &mCounters, 1, array_length); + } + cli->queue.finish(); + + reorder(startbit, num); + cli->queue.finish(); +} + +template +void Radix::blocks(int nbits, int startbit, int num) +{ + int totalBlocks = num/4/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = cta_size; + + int arg = 0; + k_radixSortBlocksKeysValues.setArg(arg++, keys->getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, values->getDevicePtr()); + 
k_radixSortBlocksKeysValues.setArg(arg++, d_tempKeys.getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, d_tempValues.getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, nbits); + k_radixSortBlocksKeysValues.setArg(arg++, startbit); + k_radixSortBlocksKeysValues.setArg(arg++, num); + k_radixSortBlocksKeysValues.setArg(arg++, totalBlocks); + k_radixSortBlocksKeysValues.setArgShared(arg++, 4 * cta_size * sizeof(T)); + k_radixSortBlocksKeysValues.setArgShared(arg++, 4 * cta_size * sizeof(T)); + + k_radixSortBlocksKeysValues.execute(global_size, local_size); + +} +template +void Radix::find_offsets(int startbit, int num) +{ + int totalBlocks = num/2/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = cta_size; + int arg = 0; + k_findRadixOffsets.setArg(arg++, d_tempKeys.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, d_tempValues.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, mCounters.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, mBlockOffsets.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, startbit); + k_findRadixOffsets.setArg(arg++, num); + k_findRadixOffsets.setArg(arg++, totalBlocks); + k_findRadixOffsets.setArgShared(arg++, 2 * cta_size * sizeof(T)); + + k_findRadixOffsets.execute(global_size, local_size); +} + +template +void Radix::naive_scan(int num) +{ + int nhist = num/2/cta_size*16; + int global_size = nhist; + int local_size = nhist; + int extra_space = nhist / 16;// #NUM_BANKS defined as 16 in RadixSort.cpp (original NV implementation) + int shared_mem_size = sizeof(T) * (nhist + extra_space); + int arg = 0; + k_scanNaive.setArg(arg++, mCountersSum.getDevicePtr()); + k_scanNaive.setArg(arg++, mCounters.getDevicePtr()); + k_scanNaive.setArg(arg++, nhist); + k_scanNaive.setArgShared(arg++, 2*shared_mem_size); + + k_scanNaive.execute(global_size, local_size); +} +template +void Radix::scan( Buffer* dst, Buffer* src, int batch_size, int array_length) +{ + scan_local1(dst, + src, + batch_size * 
array_length / (4 * SCAN_WG_SIZE), + 4 * SCAN_WG_SIZE); + + cli->queue.finish(); + scan_local2(dst, + src, + batch_size, + array_length / (4 * SCAN_WG_SIZE)); + cli->queue.finish(); + scan_update(dst, batch_size * array_length / (4 * SCAN_WG_SIZE)); + cli->queue.finish(); + +} + +template +void Radix::scan_local1( Buffer* dst, Buffer* src, int n, int size) +{ + int global_size = n * size / 4; + int local_size = SCAN_WG_SIZE; + int arg = 0; + k_scanExclusiveLocal1.setArg(arg++, dst->getDevicePtr()); + k_scanExclusiveLocal1.setArg(arg++, src->getDevicePtr()); + k_scanExclusiveLocal1.setArgShared(arg++, 2 * SCAN_WG_SIZE * sizeof(T)); + k_scanExclusiveLocal1.setArg(arg++, size); + + k_scanExclusiveLocal1.execute(global_size, local_size); +} + +template +void Radix::scan_local2( Buffer* dst, Buffer* src, int n, int size) +{ + int elements = n * size; + int dividend = elements; + int divisor = SCAN_WG_SIZE; + int global_size; + if (dividend % divisor == 0) + global_size = dividend; + else + global_size = dividend - dividend % divisor + divisor; + int local_size = SCAN_WG_SIZE; + + int arg = 0; + k_scanExclusiveLocal2.setArg(arg++, scan_buffer.getDevicePtr()); + k_scanExclusiveLocal2.setArg(arg++, dst->getDevicePtr()); + k_scanExclusiveLocal2.setArg(arg++, src->getDevicePtr()); + k_scanExclusiveLocal2.setArgShared(arg++, 2 * SCAN_WG_SIZE * sizeof(T)); + k_scanExclusiveLocal2.setArg(arg++, elements); + k_scanExclusiveLocal2.setArg(arg++, size); + k_scanExclusiveLocal2.execute(global_size, local_size); +} + +template +void Radix::scan_update( Buffer* dst, int n) +{ + int global_size = n * SCAN_WG_SIZE; + int local_size = SCAN_WG_SIZE; + int arg = 0; + k_uniformUpdate.setArg(arg++, dst->getDevicePtr()); + k_uniformUpdate.setArg(arg++, scan_buffer.getDevicePtr()); + k_uniformUpdate.execute(global_size, local_size); +} + +template +void Radix::reorder( int startbit, int num) +{ + int totalBlocks = num/2/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = 
cta_size; + int arg = 0; + k_reorderDataKeysValues.setArg(arg++, keys->getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, values->getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, d_tempKeys.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, d_tempValues.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mBlockOffsets.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mCountersSum.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mCounters.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, startbit); + k_reorderDataKeysValues.setArg(arg++, num); + k_reorderDataKeysValues.setArg(arg++, totalBlocks); + k_reorderDataKeysValues.setArgShared(arg++, 2 * cta_size * sizeof(T)); + k_reorderDataKeysValues.setArgShared(arg++, 2 * cta_size * sizeof(T)); + k_reorderDataKeysValues.execute(global_size, local_size); +} + + +#if 0 +if __name__ == "__main__": + + n = 1048576*2 + #n = 32768*2 + #n = 16384 + #n = 8192 + hashes = np.ndarray((n,1), dtype=np.uint32) + indices = np.ndarray((n,1), dtype=np.uint32) + + for i in xrange(0,n): + hashes[i] = n - i + indices[i] = i + + npsorted = np.sort(hashes,0) + + print "hashes before:", hashes[0:20].T + print "indices before: ", indices[0:20].T + + radix = Radix(n, 128, hashes.dtype) + #num_to_sort = 32768 + num_to_sort = n + hashes, indices = radix.sort(num_to_sort, hashes, indices) + + print "hashes after:", hashes[0:20].T + print "indices after: ", indices[0:20].T + + print np.linalg.norm(hashes - npsorted) + + print timings + + +#endif + + + + diff --git a/rtpslib/system/common/Radix.h b/rtpslib/system/common/Radix.h new file mode 100644 index 00000000..1c348a42 --- /dev/null +++ b/rtpslib/system/common/Radix.h @@ -0,0 +1,77 @@ +#ifndef RTPS_RADIX_SORT_H +#define RTPS_RADIX_SORT_H + +#include "CLL.h" +#include "Kernel.h" +#include "Buffer.h" + +#include +using namespace std; + +#ifndef uint +#define uint unsigned int +#endif + + +namespace rtps +{ + +template +class Radix +{ +public: + static 
const uint LOCAL_SIZE_LIMIT = 512U; + Radix(){ cli=NULL; }; + //create an OpenCL buffer from existing data + Radix( std::string source_dir, CL *cli, int max_elements, int cta_size ); + + void loadKernels(std::string source_dir); + + void sort(int num, Buffer* keys, Buffer* values); + void step(int nbits, int startbit, int num); + void blocks(int nbits, int startbit, int num); + void find_offsets(int startbit, int num); + void naive_scan(int num); + + void scan( Buffer* dst, Buffer* src, int batch_size, int array_length); + void scan_local1( Buffer* dst, Buffer* src, int n, int size); + void scan_local2( Buffer* dst, Buffer* src, int n, int size); + void scan_update( Buffer* dst, int n); + void reorder( int startbit, int num); + +private: + Kernel k_scanNaive; + //Kernel k_radixSortBlockKeysValues; + Kernel k_radixSortBlocksKeysValues; + Kernel k_reorderDataKeysValues; + Kernel k_findRadixOffsets; + Kernel k_scanExclusiveLocal1; + Kernel k_scanExclusiveLocal2; + Kernel k_uniformUpdate; + + + CL *cli; + + int WARP_SIZE; + int SCAN_WG_SIZE; + int MIN_LARGE_ARRAY_SIZE; + int bit_step; + int cta_size; + size_t uintsz; + + Buffer* keys; + Buffer* values; + //these should probably be type T + Buffer d_tempKeys; + Buffer d_tempValues; + Buffer mCounters; + Buffer mCountersSum; + Buffer mBlockOffsets; + Buffer scan_buffer; + +}; + + +#include "Radix.cpp" +} +#endif diff --git a/rtpslib/system/common/cl_src/RadixSort.cl b/rtpslib/system/common/cl_src/RadixSort.cl new file mode 100644 index 00000000..895fb2d9 --- /dev/null +++ b/rtpslib/system/common/cl_src/RadixSort.cl @@ -0,0 +1,630 @@ +/* +* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. +* +* NVIDIA Corporation and its licensors retain all intellectual property and +* proprietary rights in and to this software and related documentation. 
+* Any use, reproduction, disclosure, or distribution of this software +* and related documentation without an express license agreement from +* NVIDIA Corporation is strictly prohibited. +* +* Please refer to the applicable NVIDIA end user license agreement (EULA) +* associated with this source code for terms and conditions that govern +* your use of this NVIDIA software. +* +*/ + +//---------------------------------------------------------------------------- +// Scans each warp in parallel ("warp-scan"), one element per thread. +// uses 2 numElements of shared memory per thread (64 = elements per warp) +//---------------------------------------------------------------------------- +#define WARP_SIZE 32 +uint scanwarp(uint val, volatile __local uint* sData, int maxlevel) +{ + // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) + int localId = get_local_id(0); + int idx = 2 * localId - (localId & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + sData[idx] = val; + + if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } + if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } + if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } + if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } + if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } + + return sData[idx] - val; // convert inclusive -> exclusive +} + +//---------------------------------------------------------------------------- +// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using +// a warp-scan algorithm +//---------------------------------------------------------------------------- +uint4 scan4(uint4 idata, __local uint* ptr) +{ + + uint idx = get_local_id(0); + + uint4 val4 = idata; + uint sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + uint val = val4.w + sum[2]; + + val = scanwarp(val, ptr, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + 
if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + val4.w + sum[2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (idx < WARP_SIZE) + ptr[idx] = scanwarp(ptr[idx], ptr, 2); + + barrier(CLK_LOCAL_MEM_FENCE); + + val += ptr[idx >> 5]; + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +#ifdef __APPLE__ +__kernel uint4 rank4(uint4 preds, __local uint* sMem) +#else +uint4 rank4(uint4 preds, __local uint* sMem) +#endif +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + uint4 address = scan4(preds, sMem); + + __local uint numtrue[1]; + if (localId == localSize - 1) + { + numtrue[0] = address.w + preds.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 rank; + int idx = localId*4; + rank.x = (preds.x) ? address.x : numtrue[0] + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue[0] + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue[0] + idx + 2 - address.z; + rank.w = (preds.w) ? 
address.w : numtrue[0] + idx + 3 - address.w; + + return rank; +} + +void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem) +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + + r = rank4(lsb, sMem); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*key).x = sMem[localId]; + (*key).y = sMem[localId + localSize]; + (*key).z = sMem[localId + 2 * localSize]; + (*key).w = sMem[localId + 3 * localSize]; + + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, + __global uint4* keysOut, + uint nbits, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sMem) +{ + int globalId = get_global_id(0); + + uint4 key; + key = keysIn[globalId]; + + barrier(CLK_LOCAL_MEM_FENCE); + + radixSortBlockKeysOnly(&key, nbits, startbit, sMem); + + keysOut[globalId] = key; +} + +//---------------------------------------------------------------------------- +// Given an array with blocks sorted according to a 4-bit radix group, each +// block counts the number of keys that fall into each radix in the group, and +// finds the starting offset of each radix in the block. It then writes the radix +// counts to the counters array, and the starting offsets to the blockOffsets array. 
+// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +// +//---------------------------------------------------------------------------- +#if 0 //KeysOnly version +__kernel void findRadixOffsets(__global uint2* keys, + __global uint* counters, + __global uint* blockOffsets, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sRadix1) +{ + __local uint sStartPointers[16]; + + uint groupId = get_group_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + uint2 radix2; + + radix2 = keys[get_global_id(0)]; + + + sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. 
+ if(localId < 16) + { + sStartPointers[localId] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId]] = localId; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) + { + sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + blockOffsets[groupId*16 + localId] = sStartPointers[localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Compute the sizes of each block. + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId - 1]] = + localId - sStartPointers[sRadix1[localId - 1]]; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) + { + sStartPointers[sRadix1[localId + groupSize - 1]] = + localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; + } + + + if(localId == groupSize - 1) + { + sStartPointers[sRadix1[2 * groupSize - 1]] = + 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + counters[localId * totalBlocks + groupId] = sStartPointers[localId]; + } +} +#endif + +// a naive scan routine that works only for array that +// can fit into a single block, just for debugging purpose, +// not used in the sort now +__kernel void scanNaive(__global uint *g_odata, + __global uint *g_idata, + uint n, + __local uint* temp) +{ + + int localId = get_local_id(0); + + int pout = 0; + int pin = 1; + + // Cache the computational window in shared memory + temp[pout*n + localId] = (localId > 0) ? 
g_idata[localId-1] : 0; + + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; + pin = 1 - pout; + barrier(CLK_LOCAL_MEM_FENCE); + + temp[pout*n+localId] = temp[pin*n+localId]; + + if (localId >= offset) + temp[pout*n+localId] += temp[pin*n+localId - offset]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + g_odata[localId] = temp[pout*n+localId]; +} + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. 
+//---------------------------------------------------------------------------- +__kernel void reorderDataKeysOnly(__global uint *outKeys, + __global uint2 *keys, + __global uint *blockOffsets, + __global uint *offsets, + __global uint *sizes, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint2* sKeys2) +{ + __local uint sOffsets[16]; + __local uint sBlockOffsets[16]; + + __local uint *sKeys1 = (__local uint*)sKeys2; + + uint groupId = get_group_id(0); + + uint globalId = get_global_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + sKeys2[localId] = keys[globalId]; + + if(localId < 16) + { + sOffsets[localId] = offsets[localId * totalBlocks + groupId]; + sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint radix = (sKeys1[localId] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId]; + } + + radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId + groupSize]; + } + + +} + +//---------------------------------------------------------------------- +//#include "Radix_Keys_Values.cl" +//---------------------------------------------------------------------- + +void radixSortBlockKeysValues(uint4 *key, uint4 *value, uint nbits, uint startbit, __local uint* sMem, __local uint* sVMem) +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + uint4 rv; + + r = rank4(lsb, sMem); + rv = rank4(lsb, sVMem); + + // This 
arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; + + sVMem[(rv.x & 3) * localSize + (rv.x >> 2)] = (*value).x; + sVMem[(rv.y & 3) * localSize + (rv.y >> 2)] = (*value).y; + sVMem[(rv.z & 3) * localSize + (rv.z >> 2)] = (*value).z; + sVMem[(rv.w & 3) * localSize + (rv.w >> 2)] = (*value).w; + + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*key).x = sMem[localId]; + (*key).y = sMem[localId + localSize]; + (*key).z = sMem[localId + 2 * localSize]; + (*key).w = sMem[localId + 3 * localSize]; + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + // The above allows us to read without 4-way bank conflicts: + (*value).x = sVMem[localId]; + (*value).y = sVMem[localId + localSize]; + (*value).z = sVMem[localId + 2 * localSize]; + (*value).w = sVMem[localId + 3 * localSize]; + barrier(CLK_LOCAL_MEM_FENCE); + + // I AM NOT CLEAR ON HOW TO HANDLE value ARRAY + } + //better not put things in seperate for-loops + //i don't quite understand what the radix sort is doing but the rank4 update + // seems to be using memory addresses to do calculations and we want to couple + // our values to our keys as closely as possible +/* + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + + r = rank4(lsb, sVMem); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sVMem[(r.x & 3) * localSize + (r.x >> 2)] = (*value).x; + sVMem[(r.y & 3) * localSize + (r.y >> 2)] = (*value).y; + sVMem[(r.z & 3) * localSize + (r.z >> 2)] = (*value).z; + sVMem[(r.w & 3) * localSize + (r.w >> 2)] = 
(*value).w; + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*value).x = sVMem[localId]; + (*value).y = sVMem[localId + localSize]; + (*value).z = sVMem[localId + 2 * localSize]; + (*value).w = sVMem[localId + 3 * localSize]; + barrier(CLK_LOCAL_MEM_FENCE); + } + */ +} + +//---------------------------------------------------------------------- +// ERROR: +__kernel void radixSortBlocksKeysValues(__global uint4* keysIn, + __global uint4* valuesIn, + __global uint4* keysOut, + __global uint4* valuesOut, + uint nbits, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sMem, + __local uint* sVMem) +{ + int globalId = get_global_id(0); + + uint4 key; + key = keysIn[globalId]; + + uint4 value; + value = valuesIn[globalId]; // not sure if needed + + barrier(CLK_LOCAL_MEM_FENCE); + + radixSortBlockKeysValues(&key, &value, nbits, startbit, sMem, sVMem); + + keysOut[globalId] = key; + valuesOut[globalId] = value; + //valuesOut[globalId] = 3; // works +} + + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete. This can be faster in some cases. In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//----------------------------------------------------------------------------
+__kernel void reorderDataKeysValues(__global uint *outKeys,
+                                    __global uint *outValues,
+                                    __global uint2 *keys,
+                                    __global uint2 *values,
+                                    __global uint *blockOffsets,
+                                    __global uint *offsets,
+                                    __global uint *sizes,
+                                    uint startbit,
+                                    uint numElements,
+                                    uint totalBlocks,
+                                    __local uint2* sKeys2,   // NOTE(review): stages 2*groupSize uints of keys in local mem -- confirm this fits the device limit
+                                    __local uint2* sValues2) // NOTE(review): values are only staged, never compared -- the local copy may be avoidable
+{
+    __local uint sOffsets[16];
+    __local uint sBlockOffsets[16];
+
+    __local uint *sKeys1 = (__local uint*) sKeys2;
+    __local uint *sValues1 = (__local uint*) sValues2;
+
+    uint groupId = get_group_id(0);
+
+    uint globalId = get_global_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    sKeys2[localId] = keys[globalId];
+    sValues2[localId] = values[globalId];
+
+    if(localId < 16)
+    {
+        sOffsets[localId] = offsets[localId * totalBlocks + groupId];
+        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint radix = (sKeys1[localId] >> startbit) & 0xF;
+    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId];
+        outValues[globalOffset] = sValues1[localId];
+    }
+
+    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId + groupSize];
+        outValues[globalOffset] = sValues1[localId + groupSize];
+    }
+}
+//----------------------------------------------------------------------
+#if 1 //keys values
+__kernel void findRadixOffsets(__global uint2* keys,
+                               __global uint2* values,
+                               __global uint* counters,
+                               __global uint* blockOffsets,
+                               uint startbit,
+                               uint numElements,
+                               uint totalBlocks,
+                               __local uint* sRadix1)
+{
+    __local uint sStartPointers[16];
+
+    uint groupId = get_group_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    uint2 radix2;
+
+    radix2 = keys[get_global_id(0)];
+
+    sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF;
+    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+    // Finds the position where the sRadix1 entries differ and stores start
+    // index for each radix.
+    if(localId < 16)
+    {
+        sStartPointers[localId] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
+    {
+        sStartPointers[sRadix1[localId]] = localId;
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
+    {
+        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Compute the size of each radix run by differencing its start pointer.
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
+    {
+        sStartPointers[sRadix1[localId - 1]] =
+            localId - sStartPointers[sRadix1[localId - 1]];
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] )
+    {
+        sStartPointers[sRadix1[localId + groupSize - 1]] =
+            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+    }
+
+
+    if(localId == groupSize - 1)
+    {
+        sStartPointers[sRadix1[2 * groupSize - 1]] =
+            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+    }
+}
+//----------------------------------------------------------------------
+#endif
diff --git a/rtpslib/system/common/cl_src/Scan_b.cl b/rtpslib/system/common/cl_src/Scan_b.cl
new file mode 100644
index 00000000..32fd4dd4
--- /dev/null
+++ b/rtpslib/system/common/cl_src/Scan_b.cl
@@ -0,0 +1,190 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+
+
+//All three kernels run WORKGROUP_SIZE (256 here) work-items per workgroup
+//Must be a power of two
+#define WORKGROUP_SIZE 256
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if(1)
+    //Naive inclusive scan: O(N * log2(N)) operations
+    //Allocate 2 * 'size' local memory, initialize the first half
+    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
+    //and saving instructions
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint t = l_Data[pos] + l_Data[pos - offset];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            l_Data[pos] = t;
+        }
+
+        return l_Data[pos];
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+
+#else
+    #define LOG2_WARP_SIZE 5U
+    #define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+    //Almost the same as naiveScan1 but doesn't need barriers
+    //assuming size <= WARP_SIZE
+    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1)
+            l_Data[pos] += l_Data[pos - offset];
+
+        return l_Data[pos];
+    }
+
+    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
+        return warpScanInclusive(idata, l_Data, size) - idata;
+    }
+
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        if(size > WARP_SIZE){
+            //Bottom-level inclusive warp scan
+            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because l_Data is being overwritten)
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
+                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
+
+            //wait for warp scans to complete
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
+                //grab top warp elements
+                uint val = l_Data[get_local_id(0)];
+                //calculate exclusive scan and write back to shared memory
+                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
+            }
+
+            //return updated warp scans with exclusive scan results
+            barrier(CLK_LOCAL_MEM_FENCE);
+            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
+        }else{
+            return warpScanInclusive(idata, l_Data, size);
+        }
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+#endif
+
+
+//Vector scan: the array to be scanned is stored
+//in work-item private memory as uint4
+inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
+    //Level-0 inclusive scan
+    data4.y += data4.x;
+    data4.z += data4.y;
+    data4.w += data4.z;
+
+    //Level-1 exclusive scan
+    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
+
+    return (data4 + (uint4)val);
+}
+
+inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
+    return scan4Inclusive(data4, l_Data, size) - data4;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal1(
+    __global uint4 *d_Dst,
+    __global uint4 *d_Src,
+    __local uint* l_Data,
+    uint size
+){
+    //Load data
+    uint4 idata4 = d_Src[get_global_id(0)];
+
+    //Calculate exclusive scan
+    uint4 odata4 = scan4Exclusive(idata4, l_Data, size);
+
+    //Write back
+    d_Dst[get_global_id(0)] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal2(
+    __global uint *d_Buf,
+    __global uint *d_Dst,
+    __global uint *d_Src,
+    __local uint* l_Data,
+    uint N,
+    uint arrayLength
+){
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
+    uint data = 0;
+    if(get_global_id(0) < N)
+        data =
+            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
+            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
+
+    //Compute
+    uint odata = scan1Exclusive(data, l_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if(get_global_id(0) < N)
+        d_Buf[get_global_id(0)] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void uniformUpdate(
+    __global uint4 *d_Data,
+    __global uint *d_Buf
+){
+    __local uint buf[1];
+
+    uint4 data4 = d_Data[get_global_id(0)];
+
+    if(get_local_id(0) == 0)
+        buf[0] = d_Buf[get_group_id(0)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    data4 += (uint4)buf[0];
+    d_Data[get_global_id(0)] = data4;
+}