From a26fb009e8c0ca6cb2c2fae3a36cb60bde6a6ed3 Mon Sep 17 00:00:00 2001 From: enjalot Date: Wed, 3 Aug 2011 15:13:51 -0400 Subject: [PATCH] added radix but seems to be slower than bitonic on mac with few particles --- rtpslib/opencl/bitonic_sort/Makefile | 51 -- .../bitonic_sort/obj/release/main.cpp.o | Bin 4960 -> 0 bytes .../obj/release/oclBitonicSort_launcher.cpp.o | Bin 7476 -> 0 bytes .../release/oclSortingNetworks_validate.cpp.o | Bin 2884 -> 0 bytes rtpslib/opencl/bitonic_sort/src/main.cpp | 190 ------ .../src/oclBitonicSort_launcher.cpp | 230 ------- .../src/oclSortingNetworks_common.h | 72 -- .../src/oclSortingNetworks_validate.cpp | 114 ---- rtpslib/system/SPH.cpp | 21 + rtpslib/system/SPH.h | 3 + rtpslib/system/common/Radix.cpp | 317 +++++++++ rtpslib/system/common/Radix.h | 77 +++ rtpslib/system/common/cl_src/RadixSort.cl | 630 ++++++++++++++++++ rtpslib/system/common/cl_src/Scan_b.cl | 190 ++++++ 14 files changed, 1238 insertions(+), 657 deletions(-) delete mode 100644 rtpslib/opencl/bitonic_sort/Makefile delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o delete mode 100644 rtpslib/opencl/bitonic_sort/src/main.cpp delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h delete mode 100644 rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp create mode 100644 rtpslib/system/common/Radix.cpp create mode 100644 rtpslib/system/common/Radix.h create mode 100644 rtpslib/system/common/cl_src/RadixSort.cl create mode 100644 rtpslib/system/common/cl_src/Scan_b.cl diff --git a/rtpslib/opencl/bitonic_sort/Makefile b/rtpslib/opencl/bitonic_sort/Makefile deleted file mode 100644 index 8777dbf0..00000000 --- a/rtpslib/opencl/bitonic_sort/Makefile +++ 
/dev/null @@ -1,51 +0,0 @@ -################################################################################ -# -# Copyright 1993-2009 NVIDIA Corporation. All rights reserved. -# -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. 
-# -################################################################################ -# -# Build script for project -# -################################################################################ - -# Add source files here -EXECUTABLE := oclSortingNetworks -# C/C++ source files (compiled with gcc / c++) -SRCDIR := src/ -CCFILES := main.cpp oclSortingNetworks_validate.cpp oclBitonicSort_launcher.cpp - -################################################################################ -# Rules and targets - -# NOTE: Assuming P4 based install until OpenCL becomes public -# Adjust P4_Root, default is ${HOME}/perforce/ -# P4_ROOT=${HOME}/myperforce/ - -include ../../common/common_opencl.mk - - diff --git a/rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/main.cpp.o deleted file mode 100644 index 729e9bfa4c9862b88450e87b8b50d35399830401..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4960 zcmb7Ie{2-T6&{ldHir0|wn?GQHEXb2Xt2Yj5NMS|K732p${8GNr$h>Dj`P@e$^9^U zdtj$QBEr{tbU1aDQh`t?O8zLSKvknq5hB2XDi)R7q*Q81k!VN?4WUU%)j&!luHV~z z2X_mk?Z~s=esA8qA2YMN@BGH)53f`UQFV*J1)Z>U3N9Kt+AQ=FG;otPR^tvlL37Yb z95w~@TS#Mr2@zUMTr@D7!&xhL!wL+{Vkxy(QC^gfJWo;{w?q`8% zR@&Rmb`-~6GtJGdG@rvQQN$h3b0{ri3xPzRqW7K?`3H$X7@AlZDME=D2mhzbi>8XR}! 
zt`4)qLbSU2?dtVw#8CM->{SS!@CsIu;B6dC$F6f}Di(zZsj08ga*dUXd^?TeQRCUq zgs2y}vxf+b!a;I!gQ$$rjc%~&sM06%+Rd3D$ZK#;~itQl|M%cUP|o3nS3IQ96quxNQ~4mrRo( ztjdV9A6L)9LSiH#mAxO>isZsR{t~#`^aDUXrX6!{mJeNO3xGz$?Y?+&h^$ImEF6Kuq$`h z*1dPny*fI5^`vqhrr|##GW8eNv>V>5FGFOgBkGhVtWkRZiQvVk~{7uhJ7Ya1?R==4CKdi4aUs}{lk0V?&|xxhE*#U$SAmbURdIo>cXbX&xcXRX-1sc0hI z5*rvm9B8##Lqmh6TUN%>=4ed`ZEjO`o))#WWFjqz&({)ZtuK)T$&xO7>Vxo^?!TEQ zZEeENq!TeMiv*|elWxC&v zYfU-ClyhZS3dW90ZEe(!#pgMuJ40PvdRQzAwMPI=Ss3by=WNGWZ)QWCAo1FTxnv`v)#$?-Y>ocne5z9|Iz#z!Bhl@YjHt8u$eee}R{9p?OlpurV?~^*M8X$=G@m}SHJJYs z;Mc)7v3v-KGBXeZ;x|-tEAT78nLwQF=8teH=K;?E$<8StLJYhGoCE$V;5UIg8Mg!P z1s?%YoEsSX7$0M#*CyFBfCxPhVoqNZxDLRlXu)_R5G{bc5?>Nx4I%3NI+A}uh{p*b zpMiU-{X3{0rL`#it~$c}Fb)H?*O|Y`{I|@{GXFjEE6k5Gug4-$`xEmp^FK51WPXbI zAoF*b?_z$I`ElmwnNzJ#?E>?6n190j4D-v(FEKANr_H72L++G+HS=lAXEUdmko=}9=;dz%Ew>u@bw-pJsgF1#rz%*Z}Rwiz{79%@Se1ig8K3Rp%RTySo^d?$75Sz;`gry##@BepY1<*X+VJdE@HagC zJr5TqedmRljayyPj%50m;S;``{+&P~a8gJnGcj;0n(h^*d&QSN`DiqmlUZS+2KlbE zJCjknFJq#FEZ;UAATL9YuLkCP<}V^^u&+!D9kN%2Kx|prYoQ z+R|KKpR|O@6+>xIPp_kiD`m8|q_G|~4$2!zDzf)sIRx*OvQpH2mW0b#GLw}apDSfj zMzgX*rdBv*2WhdaEmIzEnV9p!QaCkIm-{~+UPFCQm=eX1v!-ov8GEWN)f++Kv@99L z0-I*m?v<7`Ak4nLWG)*QX1_$;5=F+Q**`dF4oEATNh39T$Xvg`-Ln=M7m>CF=t1=R u(7%Pg5PcE){pgF)m!N+eJ?JQsW{^~aq#GpVAZZ6lJ-ArthWo#yp!g3nAVDbr diff --git a/rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/oclBitonicSort_launcher.cpp.o deleted file mode 100644 index e41805406994004394bc6bd5462cca1b53078625..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7476 zcmbVRe{dA_72imJBtR~$psBjDNN7<>;P@eEwRn<+jom;DL^DcQlFO1@x!fIlcN%FK ziOEs6$0<~-X&pO^qa#c^CH|775vyYdD7K`ZxBE7Cw+Vvo zWcRb5_ult?-}k-me($@R*Up_fKT#0!#tFFKBz*faTqK-0zJary_yvvv4=bo4j-$mk z!l&SzB+;~(-X61cHeVyxugP%d?%$L64*v3Sd-Q99u%ATv!d3$3&@_KEW&}blEYP%E ze`ncGF~@!uadC)N3-^#TttlFHoC7tj(znK!5jf+Tml5Uoquga5h%X$9W(Mr`SHO8H z;6?=)C&gEDrCo~D}8maxBi_?rS5($UH)3phR_?vI5H zNZkIM@j7j&4qtR=TC;mx7W^ 
zSgt*l50R(EE9BJD|t)JalKD;edmz%%CutCiet)p!&HItN&3uH2ri2)>P%uLuhNFjVPQA0ss zsJN}S;<-p>k|}CYWu`Bfn^tt&v87}fwe@U!^ZfAefILJi?SMQ;6R(#p=&=+%F0nfw z!tL$$?I3Oy+;*I!>G*(nq6dJvUkPW^w<6WTZPq;9)(yFn;AD!@otvS;(ig!P#tQ$3lYvD$iCW2_nGGY@6j zQ-*9k{nlCfNe$a4HEeZH`)LS1XS!d~M3=os)TCAT3zqXd5}lL>@c_w#H1&UgiYDcu zy!6d-iRa-n{~aDS55DnX>|bWxpeYZXSO#?_{LyvUkx_n?x^#`z{43NYllQUflBwI5 zsLPH?4PEZK7&K4ET8=KMKC1n=U8F``K_uNAux5vvq5%mnzJ|-mD?HN$^Q|2D0mv2e zp!s(tWi9RAw53*3)2NyE^vg2|Y#}#wd4357Ctpe&JMigI{B%?vb#?F%sh@JS`6&sv zyDWD@cp`!cenDS5$YLcdTrHm_`AsCx9rSY~zjn0!5t2`U9QmN%W5sNETQ&9Ls@e95 zYR1nhogYa&v8-(POy?1{D1O!K)q<7I^UKtETrHaQTNIzBB(Et`lg(uXYBE?>q9$Wy zrAPW*V`)NcU$P)M>tXKxqVprA$~<3Xo>9y{xhkAozyd2y#T-ymZE4l~P)S)!K1=bV zcTB-^{5@Wm7nY&7MoJ>2}$e{gWDoOuGVZ7DI?80 z4=GtYqYFOebKvkxJ^`+0h~Ak_zT{dA7LxEem*o-?{@5kF1;P`XM=u+^-^O@3I;+t@i|f;By9);L84?^R3ymlff(LMD=I3A1YCmQ#%53{tB(qQut><| z!rZx1^@EZ>+}5UtVnq}i9cb@2^hWY0HQ7N*QA8<4jBpEH106%yHN^u#tD}q9jqPuv zZiMt;v`FCAlnOQMZw!tScpXW#ZnWqtg5jo&ktMR5W#Vtu{TnI%wm>M*7H^ZH@kj*T z^=9dTu(7el2*)E*G_YBxu@uR*wX15aq*t>!8I@QK>N! z2?qQ~Q#d4nhQhI|ULdoKn)Ukwe!T(}DzB8(Rm*&;wz}$GSyQX3tJd7M$UEN-*}7iC z_j#LZYK6S^k_k^@Z!DBdo>;Jxo-Bc;ElJK|9?_AajUOHPq%JPmc3D2{ccx6pr&pKG zi905w#y^t(=s0(~erT;@RJ*AVKQbFZ7Vri^ zDE|ZV0p`y$-_QK#%%5OxGJlwPjCmV#jd?xuyMVLN&z-;u;H^Nc<>De>IsPhu$dD)k zafnv{DX!Bf{`J69KnzX%3y}N_0m)Avko>#=B)eY$@hPBuACUY$4MfI716T&W7KoCF zE153=;)h7Qf%#M*>L*SB&V>D`Lfg;#z{?^3GmzT93Y4H41b!X74_E;FB@ks1pJn|X zAbuUhpRm3QNd0YLeLD~{K@72eEs*-JV|@+K1HPQ~OMx`5B|v=A5zBzn!Kbml7>K=~ zI1xy80&oiW#|Vz%`v8|2kPiV#e;7C!@_oP?!Jh#l_2OedtnB5F0Fer@3rO}`fN&|U z0?r0kfK-QDfK-PCK(dRl!tvl%EQw@%EOaD%EOO<({uVg-j z`58Q8ZYGvCB~9rF*6REp=S^?PXMj)G9O}|X1*DS zp@;zC3Ou^qK#+BW_)RI^4aBF+wSBE9(Hju? 
zyPCPmyqx)6%x`CYH}iVt_c7njd_8d#-1$BqXZ^>l-%K16%=zAqbO;{0@KJ+%5A^dm zKkFbT{mzND!FtxWvi>pFug79ZN1XM0SbqyR{zi^>ahJ_MbMZkJr|;s~_Tyasu5$5+ z%f7?KDXeUN^)9~9fEjEgUK@s%zvXj%JILGuS!>#^@zTUejbLR=1QitBNG zWo2z+s71F`f`+ZWwU=LQ|IbK^wi;>}_BP7xf~I5hPVK`{{f^8_0ZD50pxziAaWx{S zv3KHvhW%8#KZsyhRg&^1Dmsmtc!^x~Wk*Y~R4oBf+A^$m^jU{+|hW{H1eR(kBW z1a2});kBB~pEuGZ`(?w`DC?!e(k$>tvid<}3ve#Pxd`V?I2Yqwg7apaUMqZxANzk? f$kw~i4tOEn#v&-*stCPUL`kzqkQMv?Z(ia5$}D|c diff --git a/rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o b/rtpslib/opencl/bitonic_sort/obj/release/oclSortingNetworks_validate.cpp.o deleted file mode 100644 index 498398867a963001c0a54a999224b6b7b66992de..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2884 zcma)8U2GIp6u#RIwARw8@)IO1okoZaVr3PiV`E^#uFS$#&~}kTV6E-6-OzUH&P>zd zgT(EYj6;$5qKQ7#q-r00AO@2@^kECNKcqpdL7PRgJt$%~o zIjd^WZ=F(mUZHt>sVM?GJeZsmyYkVQbzU91lA~IIR50uqRo|UTt@&)}p#a`yO&MK8ly3AOW^>>vu<#VV$XH>pyt_V7# zq1={~=aLWew$GNf)JJ_NxK~e~GWEbxWnN#E4M2LhSw+ zCM%^lLbsixIQ}GahWzrc=9%P0!>8B_mC67UlOG|-H;a_Z#gaq#j`goODk!$UM^K$1 z`K0--Ga_FyrxkmXUDcZ$7rW@CNBmMAkEqJ<2{G{@%+^TH>mF>XEgY6d+OOK?@caey zrDhi2!-)N?n6sY`kC>m;-Q%nrPGHy$^vQu?qZrFOpnAgnLC-JKi959abAKT2;OT>S zFMQ*7i@z=&-b}BG+nfHaH8S&Cd3kwzZF97~*1#Dm>9M$>)l0CLnl9DGi8e=#PN^*(Yt!Ne(yY-7 z3fMiTJD_#yn8p=%&XCsHXf#PiEGFqqC~2LfbwsJa0;;82Go&@9gi+}VsZdR7jTusN zv?WUQrzdUhbSq8goj#0w z^mNQ~aMv5m(^0(eGrBN}3-j`i(QlTmgOcSY_JL#-8A$|k7ZrEs2z0zIqO+D{19Wcu zb~g(dv*HE1eG+*`7WOPSDtwfiJ+3Ts(LH3`0nEOHvFGZ6=*U~n`7=P&3Piu&jXLznh>YzgEW+mq5O>#JLf8xU0H1)Luo(Ut?k9x3undTm+fRslu4py) z6T)6d_Z_tAb6^n)%KV?`<$Msulk`E!f0pxp&M$NR4d+vwALl#|&61pe()cpY2RK(b zALRTH=PAyQb3V-Z70&5!Ey7%x|0`HKCenk;J|OL7J~n~mSt$A2cubNw{B7J%Zx{I` zAW1Wn{1x244V(`pH`|=WpU&be(hzNK-t8VVEW(c6 UZh}@oWvFjG3#jS8i$2Ew2W2aa5dZ)H diff --git a/rtpslib/opencl/bitonic_sort/src/main.cpp b/rtpslib/opencl/bitonic_sort/src/main.cpp deleted file mode 100644 index 50175c05..00000000 --- a/rtpslib/opencl/bitonic_sort/src/main.cpp +++ /dev/null @@ -1,190 +0,0 @@ 
-/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -//Standard utilities and systems includes -#include -#include "oclSortingNetworks_common.h" - -//////////////////////////////////////////////////////////////////////////////// -//Test driver -//////////////////////////////////////////////////////////////////////////////// -int main(int argc, const char **argv) -{ - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_context cxGPUContext; - cl_command_queue cqCommandQueue; - cl_mem d_InputKey, d_InputVal, d_OutputKey, d_OutputVal; - - cl_int ciErrNum; - uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; - - const uint dir = 1; - const uint N = 1048576; - const uint numValues = 65536; - - // set logfile name and start logs - shrSetLogFileName ("oclSortingNetworks.txt"); - shrLog("%s Starting...\n\n", argv[0]); - - shrLog("Initializing data...\n"); - h_InputKey = (uint *)malloc(N * sizeof(uint)); - h_InputVal = (uint *)malloc(N * sizeof(uint)); - h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); - h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); - srand(2009); - for (uint i = 0; i < N; i++) - h_InputKey[i] = rand() % numValues; - fillValues(h_InputVal, N); - - shrLog("Initializing OpenCL...\n"); - //Get the NVIDIA platform - ciErrNum = oclGetPlatformID(&cpPlatform); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Get the devices - ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, 
&cdDevice, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Create the context - cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Create a command-queue - cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - shrLog("Initializing OpenCL bitonic sorter...\n"); - initBitonicSort(cxGPUContext, cqCommandQueue, argv); - - shrLog("Creating OpenCL memory objects...\n\n"); - d_InputKey = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N * sizeof(cl_uint), h_InputKey, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_InputVal = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N * sizeof(cl_uint), h_InputVal, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_OutputKey = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(cl_uint), NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - d_OutputVal = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(cl_uint), NULL, &ciErrNum); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Temp storage for key array validation routine - uint *srcHist = (uint *)malloc(numValues * sizeof(uint)); - uint *resHist = (uint *)malloc(numValues * sizeof(uint)); - -#ifdef GPU_PROFILING - cl_event startTime, endTime; -#endif - - int globalFlag = 1;// init pass/fail flag to pass - for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) - { - shrLog("Test array length %u (%u arrays in the batch)...\n", arrayLength, N / arrayLength); - -#ifdef GPU_PROFILING - clFinish(cqCommandQueue); - ciErrNum = clEnqueueMarker(cqCommandQueue, &startTime); - oclCheckError(ciErrNum, CL_SUCCESS); - shrDeltaT(0); -#endif - - size_t szWorkgroup = bitonicSort( - NULL, - d_OutputKey, - d_OutputVal, - d_InputKey, - d_InputVal, - N / arrayLength, - arrayLength, - dir - ); - oclCheckError(szWorkgroup > 0, true); - -#ifdef GPU_PROFILING - if 
(arrayLength == N) - { - ciErrNum = clEnqueueMarker(cqCommandQueue, &endTime); - oclCheckError(ciErrNum, CL_SUCCESS); - clFinish(cqCommandQueue); - double timerValue = shrDeltaT(0); - shrLogEx(LOGBOTH | MASTER, 0, "oclSortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); - - cl_ulong startTimeVal = 0, endTimeVal = 0; - ciErrNum = clGetEventProfilingInfo( - startTime, - CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), - &startTimeVal, - NULL - ); - - ciErrNum = clGetEventProfilingInfo( - endTime, - CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), - &endTimeVal, - NULL - ); - - shrLog("OpenCL time: %.5f s\n", 1.0e-9 * (double)(endTimeVal - startTimeVal)); - } -#endif - - //Reading back results from device to host - ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_OutputKey, CL_TRUE, 0, N * sizeof(cl_uint), h_OutputKeyGPU, 0, NULL, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_OutputVal, CL_TRUE, 0, N * sizeof(cl_uint), h_OutputValGPU, 0, NULL, NULL); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Check if keys array is not corrupted and properly ordered - int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, dir, srcHist, resHist); - - //Check if values array is not corrupted - int valuesFlag = validateSortedValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); - - // accumulate any error or failure - globalFlag = globalFlag && keysFlag && valuesFlag; - } - - // pass or fail (cumulative... all tests in the loop) - shrLog("%s\n\n", globalFlag ? 
"PASSED" : "FAILED"); - - // Start Cleanup - shrLog("Shutting down...\n"); - //Discard temp storage for key validation routine - free(srcHist); - free(resHist); - - //Release kernels and program - closeBitonicSort(); - - //Release other OpenCL Objects - ciErrNum = clReleaseMemObject(d_OutputVal); - ciErrNum |= clReleaseMemObject(d_OutputKey); - ciErrNum |= clReleaseMemObject(d_InputVal); - ciErrNum |= clReleaseMemObject(d_InputKey); - ciErrNum |= clReleaseCommandQueue(cqCommandQueue); - ciErrNum |= clReleaseContext(cxGPUContext); - oclCheckError(ciErrNum, CL_SUCCESS); - - //Release host buffers - free(h_OutputValGPU); - free(h_OutputKeyGPU); - free(h_InputVal); - free(h_InputKey); - - //Finish - shrEXIT(argc, argv); -} diff --git a/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp b/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp deleted file mode 100644 index 5ce03f5d..00000000 --- a/rtpslib/opencl/bitonic_sort/src/oclBitonicSort_launcher.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. 
- * - */ - -//#include -#include "CL/cl.hpp" -#include -#include -#include "oclSortingNetworks_common.h" - -extern "C" void closeBitonicSort(void); - -//////////////////////////////////////////////////////////////////////////////// -// OpenCL launcher for bitonic sort kernel -//////////////////////////////////////////////////////////////////////////////// -//OpenCL bitonic sort program -static cl_program cpBitonicSort; - -//OpenCL bitonic sort kernels -static cl_kernel - ckBitonicSortLocal, - ckBitonicSortLocal1, - ckBitonicMergeGlobal, - ckBitonicMergeLocal; - -//Default command queue for bitonic kernels -static cl_command_queue cqDefaultCommandQue; - -static const uint LOCAL_SIZE_LIMIT = 512U; -static const char *compileOptions = "-D LOCAL_SIZE_LIMIT=512"; - -extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){ - cl_int ciErrNum; - size_t kernelLength; - - //shrLog("...loading BitonicSort.cl\n"); - //char *cBitonicSort = oclLoadProgSource(shrFindFilePath("BitonicSort.cl", argv[0]), "// My comment\n", &kernelLength); - //oclCheckError(cBitonicSort != NULL, shrTRUE); - - std::string paths(BITONIC_CL_SOURCE_DIR); - paths = paths + "/BitonicSort.cl"; - const char* pathr = paths.c_str(); - FILE* fd =fopen(pathr, "r"); - //printf("fd= %d\n", fd); - char* cBitonicSort = new char [30000]; - int nb = fread(cBitonicSort, 1, 30000, fd); - kernelLength = nb; - //printf("pathr= %s\n", pathr); - //printf("cBitonicSort= %s\n", cBitonicSort); - - //shrLog("...creating bitonic sort program\n"); - cpBitonicSort = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBitonicSort, &kernelLength, &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog("...building bitonic sort program\n"); - ciErrNum = clBuildProgram(cpBitonicSort, 0, NULL, compileOptions, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog( "...creating bitonic sort kernels\n"); - ckBitonicSortLocal = 
clCreateKernel(cpBitonicSort, "bitonicSortLocal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicSortLocal1 = clCreateKernel(cpBitonicSort, "bitonicSortLocal1", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicMergeGlobal = clCreateKernel(cpBitonicSort, "bitonicMergeGlobal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - ckBitonicMergeLocal = clCreateKernel(cpBitonicSort, "bitonicMergeLocal", &ciErrNum); - //oclCheckError(ciErrNum, CL_SUCCESS); - - //shrLog( "...checking minimum supported workgroup size\n"); - //Check for work group size - cl_device_id device; - size_t szBitonicSortLocal, szBitonicSortLocal1, szBitonicMergeLocal; - - ciErrNum = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicSortLocal, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicSortLocal, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicSortLocal1, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicSortLocal1, NULL); - ciErrNum |= clGetKernelWorkGroupInfo(ckBitonicMergeLocal, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szBitonicMergeLocal, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - if( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ){ - //shrLog("\nERROR !!! 
Minimum work-group size %u required by this application is not supported on this device.\n\n", LOCAL_SIZE_LIMIT / 2); - closeBitonicSort(); - free(cBitonicSort); - //shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); - exit(EXIT_FAILURE); - } - - //Save default command queue - cqDefaultCommandQue = cqParamCommandQue; - - //Discard temp storage - free(cBitonicSort); -} - -extern "C" void closeBitonicSort(void) -{ - cl_int ciErrNum; - ciErrNum = clReleaseKernel(ckBitonicMergeLocal); - ciErrNum |= clReleaseKernel(ckBitonicMergeGlobal); - ciErrNum |= clReleaseKernel(ckBitonicSortLocal1); - ciErrNum |= clReleaseKernel(ckBitonicSortLocal); - ciErrNum |= clReleaseProgram(cpBitonicSort); - //oclCheckError(ciErrNum, CL_SUCCESS); -} - -static cl_uint factorRadix2(cl_uint& log2L, cl_uint L){ - if(!L){ - log2L = 0; - return 0; - }else{ - for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++); - return L; - } -} - -extern "C" size_t bitonicSort( - cl_command_queue cqCommandQueue, - cl_mem d_DstKey, - cl_mem d_DstVal, - cl_mem d_SrcKey, - cl_mem d_SrcVal, - uint batch, - uint arrayLength, - uint dir -){ - if(arrayLength < 2) - return 0; - - //Only power-of-two array lengths are supported so far - cl_uint log2L; - cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength); - //oclCheckError( factorizationRemainder == 1, shrTRUE ); - - if(!cqCommandQueue) - cqCommandQueue = cqDefaultCommandQue; - - dir = (dir != 0); - - cl_int ciErrNum; - size_t localWorkSize; - size_t globalWorkSize; - - if(arrayLength <= LOCAL_SIZE_LIMIT) - { - //oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); - //Launch bitonicSortLocal - ciErrNum = clSetKernelArg(ckBitonicSortLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 2, sizeof(cl_mem), (void *)&d_SrcKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 3, sizeof(cl_mem), (void *)&d_SrcVal); - 
ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal, 5, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - } - else - { - //Launch bitonicSortLocal1 - ciErrNum = clSetKernelArg(ckBitonicSortLocal1, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 2, sizeof(cl_mem), (void *)&d_SrcKey); - ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 3, sizeof(cl_mem), (void *)&d_SrcVal); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - - for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) - { - for(unsigned stride = size / 2; stride > 0; stride >>= 1) - { - if(stride >= LOCAL_SIZE_LIMIT) - { - //Launch bitonicMergeGlobal - ciErrNum = clSetKernelArg(ckBitonicMergeGlobal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 2, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 3, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 5, sizeof(cl_uint), (void *)&size); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 6, sizeof(cl_uint), (void 
*)&stride); - ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 7, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - globalWorkSize = batch * arrayLength / 2; - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeGlobal, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - } - else - { - //Launch bitonicMergeLocal - ciErrNum = clSetKernelArg(ckBitonicMergeLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 2, sizeof(cl_mem), (void *)&d_DstKey); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 3, sizeof(cl_mem), (void *)&d_DstVal); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 4, sizeof(cl_uint), (void *)&arrayLength); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 5, sizeof(cl_uint), (void *)&stride); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 6, sizeof(cl_uint), (void *)&size); - ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 7, sizeof(cl_uint), (void *)&dir); - //oclCheckError(ciErrNum, CL_SUCCESS); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - - ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); - //oclCheckError(ciErrNum, CL_SUCCESS); - break; - } - } - } - } - return localWorkSize; -} diff --git a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h b/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h deleted file mode 100644 index 4d6cd650..00000000 --- a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_common.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -//////////////////////////////////////////////////////////////////////////////// -// Shortcut typename -//////////////////////////////////////////////////////////////////////////////// -typedef unsigned int uint; - -//////////////////////////////////////////////////////////////////////////////// -// Host-side validation routines -//////////////////////////////////////////////////////////////////////////////// -extern "C" int validateSortedKeys( - uint *result, - uint *data, - uint batch, - uint N, - uint numValues, - uint dir, - uint *srcHist, - uint *resHist -); - -extern "C" void fillValues( - uint *val, - uint N -); - -extern "C" int validateSortedValues( - uint *resKey, - uint *resVal, - uint *srcKey, - uint batchSize, - uint arrayLength -); - -extern "C" int validateValues( - uint *oval, - uint *okey, - uint N -); - -//////////////////////////////////////////////////////////////////////////////// -// OpenCL bitonic sort -//////////////////////////////////////////////////////////////////////////////// -#if 0 -extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv); - -extern "C" void closeBitonicSort(void); - -extern"C" size_t bitonicSort( - cl_command_queue cqCommandQueue, - cl_mem d_DstKey, - cl_mem d_DstVal, - cl_mem d_SrcKey, - cl_mem d_SrcVal, - uint batch, - uint arrayLength, - uint dir -); -#endif diff --git a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp b/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp deleted file mode 100644 index 59330087..00000000 --- 
a/rtpslib/opencl/bitonic_sort/src/oclSortingNetworks_validate.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -#include -#include "oclSortingNetworks_common.h" - -//////////////////////////////////////////////////////////////////////////////// -// Validate sorted keys array (check for integrity and proper order) -// returns 1 if correct/pass, returns 0 if incorrect/fail -//////////////////////////////////////////////////////////////////////////////// -extern "C" int validateSortedKeys( - uint *resKey, - uint *srcKey, - uint batch, - uint N, - uint numValues, - uint dir, - uint *srcHist, - uint *resHist -){ - shrLog("...validating sorted keys: "); - - if(N < 2){ - shrLog("arrayLength too short, exiting\n"); - return 1; - } - - for(uint j = 0; j < batch; j++, srcKey += N, resKey += N){ - memset(srcHist, 0, numValues * sizeof(uint)); - memset(resHist, 0, numValues * sizeof(uint)); - - //Build histograms for current array - for(uint i = 0; i < N; i++) - if( (srcKey[i] < numValues) && (resKey[i] < numValues) ){ - srcHist[srcKey[i]]++; - resHist[resKey[i]]++; - }else{ - shrLog("***Set %u key arrays are not limited properly***\n", j); - return 0; - } - - //Compare the histograms - for(uint i = 0; i < numValues; i++) - if(srcHist[i] != resHist[i]){ - shrLog("***Set %u key histograms do not match***\n", j); - return 0; - } - - //Check the ordering - for(uint i = 0; i < N - 1; 
i++) - if( (dir && (resKey[i] > resKey[i + 1])) || (!dir && (resKey[i] < resKey[i + 1])) ){ - shrLog("***Set %u key array is not ordered properly***\n", j); - return 0; - } - } - - //All checks passed - shrLog("OK\n"); - return 1; -} - -//////////////////////////////////////////////////////////////////////////////// -// Generate input values from input keys -// Check output values to match output keys by the same translation function -// returns 1 if correct/pass, returns 0 if incorrect/fail -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// Value validation / stability check routines -//////////////////////////////////////////////////////////////////////////////// -extern "C" void fillValues( - uint *val, - uint N -){ - for(uint i = 0; i < N; i++) - val[i] = i; -} - -extern "C" int validateSortedValues( - uint *resKey, - uint *resVal, - uint *srcKey, - uint batchSize, - uint arrayLength -){ - int stableFlag = 1; - - shrLog( "...validating sorted values array: "); - for(uint j = 0; j < batchSize; j++, resKey += arrayLength, resVal += arrayLength){ - for(uint i = 0; i < arrayLength; i++){ - if( (resVal[i] < j * arrayLength) || (resVal[i] >= (j + 1) * arrayLength) || (resKey[i] != srcKey[resVal[i]]) ){ - shrLog("***corrupted!!!***\n"); - return 0; - } - - if( (i + 1 < arrayLength) && (resKey[i] == resKey[i + 1]) && (resVal[i] > resVal[i + 1]) ) - stableFlag = 0; - } - } - - shrLog("OK\n...stability property: %s\n\n", stableFlag ? 
"Stable" : "NOT stable !!!"); - return 1; -} diff --git a/rtpslib/system/SPH.cpp b/rtpslib/system/SPH.cpp index ff6a9e2f..c336c62c 100644 --- a/rtpslib/system/SPH.cpp +++ b/rtpslib/system/SPH.cpp @@ -94,6 +94,7 @@ namespace rtps hash = Hash(common_source_dir, ps->cli, timers["hash_gpu"]); bitonic = Bitonic(common_source_dir, ps->cli ); + radix = Radix(common_source_dir, ps->cli, max_num, 128); cellindices = CellIndices(common_source_dir, ps->cli, timers["ci_gpu"] ); permute = Permute( common_source_dir, ps->cli, timers["perm_gpu"] ); @@ -367,6 +368,7 @@ namespace rtps //defined in Sort.cpp timers["bitonic"]->start(); bitonic_sort(); + //radix_sort(); timers["bitonic"]->stop(); } @@ -865,6 +867,25 @@ namespace rtps //renderer->setParticleRadius(spacing*0.5); renderer->setParticleRadius(spacing); //renderer->setRTPS( + } + //---------------------------------------------------------------------- + void SPH::radix_sort() + { + try + { + int snum = nlpo2(num); + if(snum < 1024) + { + snum = 1024; + } + //printf("sorting snum: %d", snum); + radix.sort(snum, &cl_sort_hashes, &cl_sort_indices); + } + catch (cl::Error er) + { + printf("ERROR(radix sort): %s(%s)\n", er.what(), oclErrorString(er.err())); + } + } //---------------------------------------------------------------------- void SPH::bitonic_sort() diff --git a/rtpslib/system/SPH.h b/rtpslib/system/SPH.h index 75722b6a..789c3d2d 100644 --- a/rtpslib/system/SPH.h +++ b/rtpslib/system/SPH.h @@ -27,6 +27,7 @@ class OUTER; //#include #include #include +#include //#include #include #include // contains CloudPermute @@ -183,6 +184,7 @@ namespace rtps Buffer cl_sort_output_indices; Bitonic bitonic; + Radix radix; //Parameter structs Buffer cl_sphp; @@ -223,6 +225,7 @@ namespace rtps void hash_and_sort(); void cloud_hash_and_sort(); // GE void bitonic_sort(); + void radix_sort(); void cloud_bitonic_sort(); // GE Density density; Force force; diff --git a/rtpslib/system/common/Radix.cpp b/rtpslib/system/common/Radix.cpp new 
file mode 100644 index 00000000..b2ce20ee --- /dev/null +++ b/rtpslib/system/common/Radix.cpp @@ -0,0 +1,317 @@ + +/** + * C++ port of NVIDIA's Radix Sort implementation with Key-Value instead of Keys Only + */ + + +template +Radix::Radix(std::string source_dir, CL *cli, int max_elements, int cta_size ) +{ + this->cli = cli; + + WARP_SIZE = 128;//32 + SCAN_WG_SIZE = 128;//256 + MIN_LARGE_ARRAY_SIZE = 4 * SCAN_WG_SIZE; + bit_step = 4; + //maybe cta_size should be passed to the call instead of the constructor + this->cta_size = cta_size; + uintsz = sizeof(T); + + loadKernels(source_dir); + + int num_blocks; + if ((max_elements % (cta_size * 4)) == 0) + { + num_blocks = max_elements / (cta_size * 4); + } + else + { + num_blocks = max_elements / (cta_size * 4) + 1; + } + + vector tmp(max_elements); + d_tempKeys = Buffer(cli, tmp); + d_tempValues = Buffer(cli, tmp); + + tmp.resize(WARP_SIZE * num_blocks); + mCounters = Buffer(cli, tmp); + mCountersSum = Buffer(cli, tmp); + mBlockOffsets = Buffer(cli, tmp); + + int numscan = max_elements/2/cta_size*16; + if (numscan >= MIN_LARGE_ARRAY_SIZE) + { + //#MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE 1024 + tmp.resize(numscan / 1024); + scan_buffer = Buffer(cli, tmp); + } + + + +} + +template +void Radix::loadKernels(std::string source_dir) +{ + string radix_source = source_dir + "/RadixSort.cl"; + + std::string options = "-D LOCAL_SIZE_LIMIT=512"; + cl::Program prog = cli->loadProgram(radix_source, options); + //printf("radix sort\n"); + + //printf("load scanNaive\n"); + k_scanNaive = Kernel(cli, prog, "scanNaive"); + //printf("load radixSortBlockKeysValues\n"); + //k_radixSortBlockKeysValues = Kernel(cli, prog, "radixSortBlockKeysValues"); + //printf("load radixSortBlocksKeysValues\n"); + k_radixSortBlocksKeysValues = Kernel(cli, prog, "radixSortBlocksKeysValues"); + //printf("load reorderDataKeysValues\n"); + k_reorderDataKeysValues = Kernel(cli, prog, "reorderDataKeysValues"); + //printf("load findRadixOffsets\n"); + 
k_findRadixOffsets = Kernel(cli, prog, "findRadixOffsets"); + + string scan_source = source_dir + "/Scan_b.cl"; + + options = "-D LOCAL_SIZE_LIMIT=512"; + prog = cli->loadProgram(scan_source, options); + + k_scanExclusiveLocal1 = Kernel(cli, prog, "scanExclusiveLocal1"); + k_scanExclusiveLocal2 = Kernel(cli, prog, "scanExclusiveLocal2"); + k_uniformUpdate = Kernel(cli, prog, "uniformUpdate"); + + + + //TODO: implement this check with the C++ API + //if( (szRadixSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szRadixSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) || (szRadixMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ){ + //shrLog("\nERROR !!! Minimum work-group size %u required by this application is not supported on this device.\n\n", LOCAL_SIZE_LIMIT / 2); +} + +template +void Radix::sort(int num, Buffer* keys, Buffer* values) +{ + //printf("radix sort routine\n"); + this->keys = keys; + this->values = values; + int key_bits = sizeof(T) * 8; + //printf("sorting %d\n", num); + //printf("key_bits %d\n", key_bits); + //printf("bit_step %d\n", bit_step); + + int i = 0; + while(key_bits > i*bit_step) + { + //printf("i*bit_step %d\n", i*bit_step) + step(bit_step, i*bit_step, num); + i += 1; + } + cli->queue.finish(); +} + +template +void Radix::step(int nbits, int startbit, int num) +{ + blocks(nbits, startbit, num); + cli->queue.finish(); + + find_offsets(startbit, num); + cli->queue.finish(); + + int array_length = num/2/cta_size*16; + if(array_length < MIN_LARGE_ARRAY_SIZE) + { + naive_scan(num); + } + else + { + scan(&mCountersSum, &mCounters, 1, array_length); + } + cli->queue.finish(); + + reorder(startbit, num); + cli->queue.finish(); +} + +template +void Radix::blocks(int nbits, int startbit, int num) +{ + int totalBlocks = num/4/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = cta_size; + + int arg = 0; + k_radixSortBlocksKeysValues.setArg(arg++, keys->getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, values->getDevicePtr()); + 
k_radixSortBlocksKeysValues.setArg(arg++, d_tempKeys.getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, d_tempValues.getDevicePtr()); + k_radixSortBlocksKeysValues.setArg(arg++, nbits); + k_radixSortBlocksKeysValues.setArg(arg++, startbit); + k_radixSortBlocksKeysValues.setArg(arg++, num); + k_radixSortBlocksKeysValues.setArg(arg++, totalBlocks); + k_radixSortBlocksKeysValues.setArgShared(arg++, 4 * cta_size * sizeof(T)); + k_radixSortBlocksKeysValues.setArgShared(arg++, 4 * cta_size * sizeof(T)); + + k_radixSortBlocksKeysValues.execute(global_size, local_size); + +} +template +void Radix::find_offsets(int startbit, int num) +{ + int totalBlocks = num/2/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = cta_size; + int arg = 0; + k_findRadixOffsets.setArg(arg++, d_tempKeys.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, d_tempValues.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, mCounters.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, mBlockOffsets.getDevicePtr()); + k_findRadixOffsets.setArg(arg++, startbit); + k_findRadixOffsets.setArg(arg++, num); + k_findRadixOffsets.setArg(arg++, totalBlocks); + k_findRadixOffsets.setArgShared(arg++, 2 * cta_size * sizeof(T)); + + k_findRadixOffsets.execute(global_size, local_size); +} + +template +void Radix::naive_scan(int num) +{ + int nhist = num/2/cta_size*16; + int global_size = nhist; + int local_size = nhist; + int extra_space = nhist / 16;// #NUM_BANKS defined as 16 in RadixSort.cpp (original NV implementation) + int shared_mem_size = sizeof(T) * (nhist + extra_space); + int arg = 0; + k_scanNaive.setArg(arg++, mCountersSum.getDevicePtr()); + k_scanNaive.setArg(arg++, mCounters.getDevicePtr()); + k_scanNaive.setArg(arg++, nhist); + k_scanNaive.setArgShared(arg++, 2*shared_mem_size); + + k_scanNaive.execute(global_size, local_size); +} +template +void Radix::scan( Buffer* dst, Buffer* src, int batch_size, int array_length) +{ + scan_local1(dst, + src, + batch_size * 
array_length / (4 * SCAN_WG_SIZE), + 4 * SCAN_WG_SIZE); + + cli->queue.finish(); + scan_local2(dst, + src, + batch_size, + array_length / (4 * SCAN_WG_SIZE)); + cli->queue.finish(); + scan_update(dst, batch_size * array_length / (4 * SCAN_WG_SIZE)); + cli->queue.finish(); + +} + +template +void Radix::scan_local1( Buffer* dst, Buffer* src, int n, int size) +{ + int global_size = n * size / 4; + int local_size = SCAN_WG_SIZE; + int arg = 0; + k_scanExclusiveLocal1.setArg(arg++, dst->getDevicePtr()); + k_scanExclusiveLocal1.setArg(arg++, src->getDevicePtr()); + k_scanExclusiveLocal1.setArgShared(arg++, 2 * SCAN_WG_SIZE * sizeof(T)); + k_scanExclusiveLocal1.setArg(arg++, size); + + k_scanExclusiveLocal1.execute(global_size, local_size); +} + +template +void Radix::scan_local2( Buffer* dst, Buffer* src, int n, int size) +{ + int elements = n * size; + int dividend = elements; + int divisor = SCAN_WG_SIZE; + int global_size; + if (dividend % divisor == 0) + global_size = dividend; + else + global_size = dividend - dividend % divisor + divisor; + int local_size = SCAN_WG_SIZE; + + int arg = 0; + k_scanExclusiveLocal2.setArg(arg++, scan_buffer.getDevicePtr()); + k_scanExclusiveLocal2.setArg(arg++, dst->getDevicePtr()); + k_scanExclusiveLocal2.setArg(arg++, src->getDevicePtr()); + k_scanExclusiveLocal2.setArgShared(arg++, 2 * SCAN_WG_SIZE * sizeof(T)); + k_scanExclusiveLocal2.setArg(arg++, elements); + k_scanExclusiveLocal2.setArg(arg++, size); + k_scanExclusiveLocal2.execute(global_size, local_size); +} + +template +void Radix::scan_update( Buffer* dst, int n) +{ + int global_size = n * SCAN_WG_SIZE; + int local_size = SCAN_WG_SIZE; + int arg = 0; + k_uniformUpdate.setArg(arg++, dst->getDevicePtr()); + k_uniformUpdate.setArg(arg++, scan_buffer.getDevicePtr()); + k_uniformUpdate.execute(global_size, local_size); +} + +template +void Radix::reorder( int startbit, int num) +{ + int totalBlocks = num/2/cta_size; + int global_size = cta_size*totalBlocks; + int local_size = 
cta_size; + int arg = 0; + k_reorderDataKeysValues.setArg(arg++, keys->getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, values->getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, d_tempKeys.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, d_tempValues.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mBlockOffsets.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mCountersSum.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, mCounters.getDevicePtr()); + k_reorderDataKeysValues.setArg(arg++, startbit); + k_reorderDataKeysValues.setArg(arg++, num); + k_reorderDataKeysValues.setArg(arg++, totalBlocks); + k_reorderDataKeysValues.setArgShared(arg++, 2 * cta_size * sizeof(T)); + k_reorderDataKeysValues.setArgShared(arg++, 2 * cta_size * sizeof(T)); + k_reorderDataKeysValues.execute(global_size, local_size); +} + + +#if 0 +if __name__ == "__main__": + + n = 1048576*2 + #n = 32768*2 + #n = 16384 + #n = 8192 + hashes = np.ndarray((n,1), dtype=np.uint32) + indices = np.ndarray((n,1), dtype=np.uint32) + + for i in xrange(0,n): + hashes[i] = n - i + indices[i] = i + + npsorted = np.sort(hashes,0) + + print "hashes before:", hashes[0:20].T + print "indices before: ", indices[0:20].T + + radix = Radix(n, 128, hashes.dtype) + #num_to_sort = 32768 + num_to_sort = n + hashes, indices = radix.sort(num_to_sort, hashes, indices) + + print "hashes after:", hashes[0:20].T + print "indices after: ", indices[0:20].T + + print np.linalg.norm(hashes - npsorted) + + print timings + + +#endif + + + + diff --git a/rtpslib/system/common/Radix.h b/rtpslib/system/common/Radix.h new file mode 100644 index 00000000..1c348a42 --- /dev/null +++ b/rtpslib/system/common/Radix.h @@ -0,0 +1,77 @@ +#ifndef RTPS_RADIX_SORT_H +#define RTPS_RADIX_SORT_H + +#include "CLL.h" +#include "Kernel.h" +#include "Buffer.h" + +#include +using namespace std; + +#ifndef uint +#define uint unsigned int +#endif + + +namespace rtps +{ + +template +class Radix +{ +public: + static 
const uint LOCAL_SIZE_LIMIT = 512U; + Radix(){ cli=NULL; }; + //create an OpenCL buffer from existing data + Radix( std::string source_dir, CL *cli, int max_elements, int cta_size ); + + void loadKernels(std::string source_dir); + + void sort(int num, Buffer* keys, Buffer* values); + void step(int nbits, int startbit, int num); + void blocks(int nbits, int startbit, int num); + void find_offsets(int startbit, int num); + void naive_scan(int num); + + void scan( Buffer* dst, Buffer* src, int batch_size, int array_length); + void scan_local1( Buffer* dst, Buffer* src, int n, int size); + void scan_local2( Buffer* dst, Buffer* src, int n, int size); + void scan_update( Buffer* dst, int n); + void reorder( int startbit, int num); + +private: + Kernel k_scanNaive; + //Kernel k_radixSortBlockKeysValues; + Kernel k_radixSortBlocksKeysValues; + Kernel k_reorderDataKeysValues; + Kernel k_findRadixOffsets; + Kernel k_scanExclusiveLocal1; + Kernel k_scanExclusiveLocal2; + Kernel k_uniformUpdate; + + + CL *cli; + + int WARP_SIZE; + int SCAN_WG_SIZE; + int MIN_LARGE_ARRAY_SIZE; + int bit_step; + int cta_size; + size_t uintsz; + + Buffer* keys; + Buffer* values; + //these should probably be type T + Buffer d_tempKeys; + Buffer d_tempValues; + Buffer mCounters; + Buffer mCountersSum; + Buffer mBlockOffsets; + Buffer scan_buffer; + +}; + + +#include "Radix.cpp" +} +#endif diff --git a/rtpslib/system/common/cl_src/RadixSort.cl b/rtpslib/system/common/cl_src/RadixSort.cl new file mode 100644 index 00000000..895fb2d9 --- /dev/null +++ b/rtpslib/system/common/cl_src/RadixSort.cl @@ -0,0 +1,630 @@ +/* +* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. +* +* NVIDIA Corporation and its licensors retain all intellectual property and +* proprietary rights in and to this software and related documentation. 
+* Any use, reproduction, disclosure, or distribution of this software +* and related documentation without an express license agreement from +* NVIDIA Corporation is strictly prohibited. +* +* Please refer to the applicable NVIDIA end user license agreement (EULA) +* associated with this source code for terms and conditions that govern +* your use of this NVIDIA software. +* +*/ + +//---------------------------------------------------------------------------- +// Scans each warp in parallel ("warp-scan"), one element per thread. +// uses 2 numElements of shared memory per thread (64 = elements per warp) +//---------------------------------------------------------------------------- +#define WARP_SIZE 32 +uint scanwarp(uint val, volatile __local uint* sData, int maxlevel) +{ + // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) + int localId = get_local_id(0); + int idx = 2 * localId - (localId & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + sData[idx] = val; + + if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } + if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } + if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } + if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } + if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } + + return sData[idx] - val; // convert inclusive -> exclusive +} + +//---------------------------------------------------------------------------- +// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using +// a warp-scan algorithm +//---------------------------------------------------------------------------- +uint4 scan4(uint4 idata, __local uint* ptr) +{ + + uint idx = get_local_id(0); + + uint4 val4 = idata; + uint sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + uint val = val4.w + sum[2]; + + val = scanwarp(val, ptr, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + 
if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + val4.w + sum[2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (idx < WARP_SIZE) + ptr[idx] = scanwarp(ptr[idx], ptr, 2); + + barrier(CLK_LOCAL_MEM_FENCE); + + val += ptr[idx >> 5]; + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +#ifdef __APPLE__ +__kernel uint4 rank4(uint4 preds, __local uint* sMem) +#else +uint4 rank4(uint4 preds, __local uint* sMem) +#endif +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + uint4 address = scan4(preds, sMem); + + __local uint numtrue[1]; + if (localId == localSize - 1) + { + numtrue[0] = address.w + preds.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 rank; + int idx = localId*4; + rank.x = (preds.x) ? address.x : numtrue[0] + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue[0] + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue[0] + idx + 2 - address.z; + rank.w = (preds.w) ? 
address.w : numtrue[0] + idx + 3 - address.w; + + return rank; +} + +void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem) +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + + r = rank4(lsb, sMem); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*key).x = sMem[localId]; + (*key).y = sMem[localId + localSize]; + (*key).z = sMem[localId + 2 * localSize]; + (*key).w = sMem[localId + 3 * localSize]; + + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, + __global uint4* keysOut, + uint nbits, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sMem) +{ + int globalId = get_global_id(0); + + uint4 key; + key = keysIn[globalId]; + + barrier(CLK_LOCAL_MEM_FENCE); + + radixSortBlockKeysOnly(&key, nbits, startbit, sMem); + + keysOut[globalId] = key; +} + +//---------------------------------------------------------------------------- +// Given an array with blocks sorted according to a 4-bit radix group, each +// block counts the number of keys that fall into each radix in the group, and +// finds the starting offset of each radix in the block. It then writes the radix +// counts to the counters array, and the starting offsets to the blockOffsets array. 
+// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +// +//---------------------------------------------------------------------------- +#if 0 //KeysOnly version +__kernel void findRadixOffsets(__global uint2* keys, + __global uint* counters, + __global uint* blockOffsets, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sRadix1) +{ + __local uint sStartPointers[16]; + + uint groupId = get_group_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + uint2 radix2; + + radix2 = keys[get_global_id(0)]; + + + sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. 
+ if(localId < 16) + { + sStartPointers[localId] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId]] = localId; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) + { + sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + blockOffsets[groupId*16 + localId] = sStartPointers[localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Compute the sizes of each block. + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId - 1]] = + localId - sStartPointers[sRadix1[localId - 1]]; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) + { + sStartPointers[sRadix1[localId + groupSize - 1]] = + localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; + } + + + if(localId == groupSize - 1) + { + sStartPointers[sRadix1[2 * groupSize - 1]] = + 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + counters[localId * totalBlocks + groupId] = sStartPointers[localId]; + } +} +#endif + +// a naive scan routine that works only for array that +// can fit into a single block, just for debugging purpose, +// not used in the sort now +__kernel void scanNaive(__global uint *g_odata, + __global uint *g_idata, + uint n, + __local uint* temp) +{ + + int localId = get_local_id(0); + + int pout = 0; + int pin = 1; + + // Cache the computational window in shared memory + temp[pout*n + localId] = (localId > 0) ? 
g_idata[localId-1] : 0; + + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; + pin = 1 - pout; + barrier(CLK_LOCAL_MEM_FENCE); + + temp[pout*n+localId] = temp[pin*n+localId]; + + if (localId >= offset) + temp[pout*n+localId] += temp[pin*n+localId - offset]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + g_odata[localId] = temp[pout*n+localId]; +} + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. 
+//---------------------------------------------------------------------------- +__kernel void reorderDataKeysOnly(__global uint *outKeys, + __global uint2 *keys, + __global uint *blockOffsets, + __global uint *offsets, + __global uint *sizes, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint2* sKeys2) +{ + __local uint sOffsets[16]; + __local uint sBlockOffsets[16]; + + __local uint *sKeys1 = (__local uint*)sKeys2; + + uint groupId = get_group_id(0); + + uint globalId = get_global_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + sKeys2[localId] = keys[globalId]; + + if(localId < 16) + { + sOffsets[localId] = offsets[localId * totalBlocks + groupId]; + sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint radix = (sKeys1[localId] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId]; + } + + radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId + groupSize]; + } + + +} + +//---------------------------------------------------------------------- +//#include "Radix_Keys_Values.cl" +//---------------------------------------------------------------------- + +void radixSortBlockKeysValues(uint4 *key, uint4 *value, uint nbits, uint startbit, __local uint* sMem, __local uint* sVMem) +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + uint4 rv; + + r = rank4(lsb, sMem); + rv = rank4(lsb, sVMem); + + // This 
arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; + + sVMem[(rv.x & 3) * localSize + (rv.x >> 2)] = (*value).x; + sVMem[(rv.y & 3) * localSize + (rv.y >> 2)] = (*value).y; + sVMem[(rv.z & 3) * localSize + (rv.z >> 2)] = (*value).z; + sVMem[(rv.w & 3) * localSize + (rv.w >> 2)] = (*value).w; + + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*key).x = sMem[localId]; + (*key).y = sMem[localId + localSize]; + (*key).z = sMem[localId + 2 * localSize]; + (*key).w = sMem[localId + 3 * localSize]; + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + // The above allows us to read without 4-way bank conflicts: + (*value).x = sVMem[localId]; + (*value).y = sVMem[localId + localSize]; + (*value).z = sVMem[localId + 2 * localSize]; + (*value).w = sVMem[localId + 3 * localSize]; + barrier(CLK_LOCAL_MEM_FENCE); + + // I AM NOT CLEAR ON HOW TO HANDLE value ARRAY + } + //better not put things in seperate for-loops + //i don't quite understand what the radix sort is doing but the rank4 update + // seems to be using memory addresses to do calculations and we want to couple + // our values to our keys as closely as possible +/* + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + + r = rank4(lsb, sVMem); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sVMem[(r.x & 3) * localSize + (r.x >> 2)] = (*value).x; + sVMem[(r.y & 3) * localSize + (r.y >> 2)] = (*value).y; + sVMem[(r.z & 3) * localSize + (r.z >> 2)] = (*value).z; + sVMem[(r.w & 3) * localSize + (r.w >> 2)] = 
(*value).w; + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*value).x = sVMem[localId]; + (*value).y = sVMem[localId + localSize]; + (*value).z = sVMem[localId + 2 * localSize]; + (*value).w = sVMem[localId + 3 * localSize]; + barrier(CLK_LOCAL_MEM_FENCE); + } + */ +} + +//---------------------------------------------------------------------- +// ERROR: +__kernel void radixSortBlocksKeysValues(__global uint4* keysIn, + __global uint4* valuesIn, + __global uint4* keysOut, + __global uint4* valuesOut, + uint nbits, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sMem, + __local uint* sVMem) +{ + int globalId = get_global_id(0); + + uint4 key; + key = keysIn[globalId]; + + uint4 value; + value = valuesIn[globalId]; // not sure if needed + + barrier(CLK_LOCAL_MEM_FENCE); + + radixSortBlockKeysValues(&key, &value, nbits, startbit, sMem, sVMem); + + keysOut[globalId] = key; + valuesOut[globalId] = value; + //valuesOut[globalId] = 3; // works +} + + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete. This can be faster in some cases. In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//----------------------------------------------------------------------------
+__kernel void reorderDataKeysValues(__global uint *outKeys,
+                                    __global uint *outValues,
+                                    __global uint2 *keys,
+                                    __global uint2 *values,
+                                    __global uint *blockOffsets,
+                                    __global uint *offsets,
+                                    __global uint *sizes,
+                                    uint startbit,
+                                    uint numElements,
+                                    uint totalBlocks,
+                                    __local uint2* sKeys2,   // NOTE(review): stages 2*groupSize uints of keys in local mem -- confirm this fits the device limit
+                                    __local uint2* sValues2) // NOTE(review): values are only staged, never compared -- the local copy may be avoidable
+{
+    __local uint sOffsets[16];
+    __local uint sBlockOffsets[16];
+
+    __local uint *sKeys1 = (__local uint*) sKeys2;
+    __local uint *sValues1 = (__local uint*) sValues2;
+
+    uint groupId = get_group_id(0);
+
+    uint globalId = get_global_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    sKeys2[localId] = keys[globalId];
+    sValues2[localId] = values[globalId];
+
+    if(localId < 16)
+    {
+        sOffsets[localId] = offsets[localId * totalBlocks + groupId];
+        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint radix = (sKeys1[localId] >> startbit) & 0xF;
+    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId];
+        outValues[globalOffset] = sValues1[localId];
+    }
+
+    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId + groupSize];
+        outValues[globalOffset] = sValues1[localId + groupSize];
+    }
+}
+//----------------------------------------------------------------------
+#if 1 //keys values
+__kernel void findRadixOffsets(__global uint2* keys,
+                               __global uint2* values,
+                               __global uint* counters,
+                               __global uint* blockOffsets,
+                               uint startbit,
+                               uint numElements,
+                               uint totalBlocks,
+                               __local uint* sRadix1)
+{
+    __local uint sStartPointers[16];
+
+    uint groupId = get_group_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    uint2 radix2;
+
+    radix2 = keys[get_global_id(0)];
+
+    sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF;
+    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+    // Finds the position where the sRadix1 entries differ and stores start
+    // index for each radix.
+    if(localId < 16)
+    {
+        sStartPointers[localId] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
+    {
+        sStartPointers[sRadix1[localId]] = localId;
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
+    {
+        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Compute the size of each radix run by differencing its start pointer.
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
+    {
+        sStartPointers[sRadix1[localId - 1]] =
+            localId - sStartPointers[sRadix1[localId - 1]];
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] )
+    {
+        sStartPointers[sRadix1[localId + groupSize - 1]] =
+            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+    }
+
+
+    if(localId == groupSize - 1)
+    {
+        sStartPointers[sRadix1[2 * groupSize - 1]] =
+            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+    }
+}
+//----------------------------------------------------------------------
+#endif
diff --git a/rtpslib/system/common/cl_src/Scan_b.cl b/rtpslib/system/common/cl_src/Scan_b.cl
new file mode 100644
index 00000000..32fd4dd4
--- /dev/null
+++ b/rtpslib/system/common/cl_src/Scan_b.cl
@@ -0,0 +1,190 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+
+
+//All three kernels run WORKGROUP_SIZE (256 here) work-items per workgroup
+//Must be a power of two
+#define WORKGROUP_SIZE 256
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if(1)
+    //Naive inclusive scan: O(N * log2(N)) operations
+    //Allocate 2 * 'size' local memory, initialize the first half
+    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
+    //and saving instructions
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint t = l_Data[pos] + l_Data[pos - offset];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            l_Data[pos] = t;
+        }
+
+        return l_Data[pos];
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+
+#else
+    #define LOG2_WARP_SIZE 5U
+    #define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+    //Almost the same as naiveScan1 but doesn't need barriers
+    //assuming size <= WARP_SIZE
+    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1)
+            l_Data[pos] += l_Data[pos - offset];
+
+        return l_Data[pos];
+    }
+
+    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
+        return warpScanInclusive(idata, l_Data, size) - idata;
+    }
+
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        if(size > WARP_SIZE){
+            //Bottom-level inclusive warp scan
+            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because l_Data is being overwritten)
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
+                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
+
+            //wait for warp scans to complete
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
+                //grab top warp elements
+                uint val = l_Data[get_local_id(0)];
+                //calculate exclusive scan and write back to shared memory
+                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
+            }
+
+            //return updated warp scans with exclusive scan results
+            barrier(CLK_LOCAL_MEM_FENCE);
+            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
+        }else{
+            return warpScanInclusive(idata, l_Data, size);
+        }
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+#endif
+
+
+//Vector scan: the array to be scanned is stored
+//in work-item private memory as uint4
+inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
+    //Level-0 inclusive scan
+    data4.y += data4.x;
+    data4.z += data4.y;
+    data4.w += data4.z;
+
+    //Level-1 exclusive scan
+    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
+
+    return (data4 + (uint4)val);
+}
+
+inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
+    return scan4Inclusive(data4, l_Data, size) - data4;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal1(
+    __global uint4 *d_Dst,
+    __global uint4 *d_Src,
+    __local uint* l_Data,
+    uint size
+){
+    //Load data
+    uint4 idata4 = d_Src[get_global_id(0)];
+
+    //Calculate exclusive scan
+    uint4 odata4 = scan4Exclusive(idata4, l_Data, size);
+
+    //Write back
+    d_Dst[get_global_id(0)] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal2(
+    __global uint *d_Buf,
+    __global uint *d_Dst,
+    __global uint *d_Src,
+    __local uint* l_Data,
+    uint N,
+    uint arrayLength
+){
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
+    uint data = 0;
+    if(get_global_id(0) < N)
+        data =
+            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
+            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
+
+    //Compute
+    uint odata = scan1Exclusive(data, l_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if(get_global_id(0) < N)
+        d_Buf[get_global_id(0)] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void uniformUpdate(
+    __global uint4 *d_Data,
+    __global uint *d_Buf
+){
+    __local uint buf[1];
+
+    uint4 data4 = d_Data[get_global_id(0)];
+
+    if(get_local_id(0) == 0)
+        buf[0] = d_Buf[get_group_id(0)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    data4 += (uint4)buf[0];
+    d_Data[get_global_id(0)] = data4;
+}