Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Version 2

  • Loading branch information...
commit e718137e00cf1f7deafd31dfe7176451026e0038 1 parent 267fa52
Valdas authored
0  ber_gpu/template_gold.cpp → ber_gpu/C_gold.cpp
File renamed without changes
56 ber_gpu/template_kernel.cu → ber_gpu/gpu_kernel.cu
@@ -9,33 +9,38 @@
9 9 *
10 10 */
11 11
12   -/* Template project which demonstrates the basics on how to setup a project
13   - * example application.
14   - * Device code.
15   - */
16 12
17 13 #ifndef _TEMPLATE_KERNEL_H_
18 14 #define _TEMPLATE_KERNEL_H_
19 15
20 16 #include <cuComplex.h>
  17 +#include <stdio.h>
  18 +#include <stdlib.h>
  19 +#include <cuda.h>
21 20 #include <math_functions.h>
  21 +#include <curand_kernel.h>
  22 +#include "cuda_runtime.h"
  23 +#include "device_launch_parameters.h"
  24 +#include "device_functions.h"
22 25
23   -////////////////////////////////////////////////////////////////////////////////
24   -//! Simple test kernel for device functionality
25   -//! @param g_idata input data in global memory
26   -//! @param g_odata output data in global memory
27   -////////////////////////////////////////////////////////////////////////////////
28   -
29   -__device__ __forceinline__ int countBitsDev(int i)
  26 +__device__ __forceinline__ unsigned int countBitsDev(unsigned int v)
30 27 {
31   - i = i - ((i >> 1) & 0x55555555);
32   - i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
33   - return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
  28 + unsigned int c;
  29 + for (c = 0; v; c++)
  30 + {
  31 + v &= v - 1; // clear the least significant bit set
  32 + }
  33 + return c;
34 34 }
35 35
  36 +__device__ __forceinline__ int quantisizeDev(float x)
  37 +{
  38 + int res=(int)floor(x)+2;
  39 + res=max(0,res);
  40 + return min(3,res);
  41 +}
36 42
37   -template<class type>
38   -__device__ __forceinline__ int quantisizeDev(type x)
  43 +__device__ __forceinline__ int quantisizeDev(double x)
39 44 {
40 45 int res=(int)floor(x)+2;
41 46 res=max(0,res);
@@ -61,12 +66,12 @@ kernelDouble(curandState * state, const unsigned int iter, const double awgn_sig
61 66 int id = threadIdx.x + blockIdx.x * blockDim.x;
62 67 /* Copy state to local memory for efficiency */
63 68 curandState ls = state[id];
  69 + unsigned int local_distance=0;
64 70 cuDoubleComplex signal, noise;
65 71 int input, output;
66   - distance[id]=0;
67   - for (int i=0; i<iter;i++) {
  72 + for (unsigned int i=0; i<iter;i++) {
68 73 input = curand(&ls) & 15; //same as %16
69   - signal = make_cuDoubleComplex((double)(input & 3)-1.5,(double)(input >> 2)-1.5);
  74 + signal = make_cuDoubleComplex((input & 3)-1.5,(input >> 2)-1.5);
70 75
71 76 sincos(curand_normal_double(&ls)*phase_sigma, &noise.y, &noise.x);
72 77 signal = cuCmul(signal,noise);
@@ -77,9 +82,10 @@ kernelDouble(curandState * state, const unsigned int iter, const double awgn_sig
77 82 signal = cuCadd(signal,noise);
78 83
79 84 output = quantisizeDev(signal.x) + quantisizeDev(signal.y)*4;
80   - distance[id] += bitTable[output ^ input];
  85 + local_distance += bitTable[output ^ input];
81 86 }
82 87 state[id] = ls;
  88 + distance[id] = local_distance;
83 89 }
84 90
85 91 __global__ void
@@ -88,12 +94,12 @@ kernelFloat(curandState * state, const unsigned int iter, const float awgn_sigma
88 94 int id = threadIdx.x + blockIdx.x * blockDim.x;
89 95 /* Copy state to local memory for efficiency */
90 96 curandState ls = state[id];
  97 + unsigned int local_distance=0;
91 98 cuComplex signal, noise;
92 99 int input, output;
93   - distance[id]=0;
94   - for (int i=0; i<iter;i++) {
  100 + for (unsigned int i=0; i<iter;i++) {
95 101 input = curand(&ls) & 15; //same as %16
96   - signal = make_cuComplex((float)(input & 3)-1.5,(float)(input >> 2)-1.5);
  102 + signal = make_cuComplex((input & 3)-1.5,(input >> 2)-1.5);
97 103
98 104 sincosf(curand_normal(&ls)*phase_sigma, &noise.y, &noise.x);
99 105 signal = cuCmulf(signal,noise);
@@ -104,9 +110,11 @@ kernelFloat(curandState * state, const unsigned int iter, const float awgn_sigma
104 110 signal = cuCaddf(signal,noise);
105 111
106 112 output = quantisizeDev(signal.x) + quantisizeDev(signal.y)*4;
107   - distance[id] += bitTable[output ^ input];
  113 + local_distance += bitTable[output ^ input];
108 114 }
  115 +
109 116 state[id] = ls;
  117 + distance[id] = local_distance;
110 118 }
111 119
112 120 #endif // #ifndef _TEMPLATE_KERNEL_H_
45 ber_gpu/template.cu → ber_gpu/main_results.cu
@@ -32,7 +32,7 @@
32 32 #include <shrUtils.h>
33 33
34 34 // includes, kernels
35   -#include <template_kernel.cu>
  35 +#include <gpu_kernel.cu>
36 36
37 37 extern "C"
38 38 double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
@@ -197,17 +197,6 @@ void gpuCompute(int argc, char** argv)
197 197 fclose(file);
198 198 sdkStopTimer( &timerTotal);
199 199
200   - printf("----------------\n");
201   - if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
202   - printf("----------------\n");
203   - printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
204   - printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
205   - printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
206   - printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
207   - printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
208   - //printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
209   -
210   -
211 200 /* Cleanup */
212 201 checkCudaErrors ( cudaFree ( devStates ));
213 202 checkCudaErrors ( cudaFree ( devResults ));
@@ -221,30 +210,6 @@ void gpuCompute(int argc, char** argv)
221 210 sdkDeleteTimer( &timerSetup );
222 211 }
223 212
224   -void seqCompute(int argc, char** argv) {
225   - StopWatchInterface *timer = 0;
226   - StopWatchInterface *timer2 = 0;
227   -
228   - sdkCreateTimer( &timer );
229   - sdkStartTimer( &timer );
230   - double resultDouble = computeGoldDouble(n*num_threads*num_blocks,(double)1,(double)0.16);
231   - sdkStopTimer( &timer );
232   -
233   - sdkCreateTimer( &timer2 );
234   - sdkStartTimer( &timer2 );
235   - double resultFloat = computeGoldFloat(n*num_threads*num_blocks,(float)1,(float)0.16);
236   - sdkStopTimer( &timer2 );
237   -
238   -
239   - printf("----------------\n");
240   - printf("CPU versions:\n");
241   - printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", sdkGetTimerValue( &timer ),sdkGetTimerValue( &timer2 ) );
242   - printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",resultDouble,resultFloat);
243   -
244   - sdkDeleteTimer( &timer );
245   - sdkDeleteTimer( &timer2 );
246   -
247   -}
248 213
249 214 ////////////////////////////////////////////////////////////////////////////////
250 215 // Program main
@@ -255,13 +220,11 @@ main( int argc, char** argv)
255 220 num_blocks = 16;
256 221 num_threads = 256;
257 222 kernelIter = 1;
258   -
259   - //runTest( argc, argv);
260   - //seqCompute(argc, argv);
  223 +
261 224 if (argc >= 3) {
262 225 n = atoi(argv[2]);
263   - printf("Config: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i Total:%lu\n",num_blocks,num_threads,n,kernelIter,(unsigned long long) num_blocks*num_threads*n*kernelIter);
  226 + printf("Config: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i Total N:%lu\n",num_blocks,num_threads,n,kernelIter,(unsigned long long) num_blocks*num_threads*n*kernelIter);
264 227 gpuCompute(argc, argv);
265 228 }
266   - //std::cin(0);
  229 +
267 230 }
211 ber_gpu/main_timing.cu
... ... @@ -0,0 +1,211 @@
  1 +/*
  2 + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
  3 + *
  4 + * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 + * with this source code for terms and conditions that govern your use of
  6 + * this software. Any use, reproduction, disclosure, or distribution of
  7 + * this software and related documentation outside the terms of the EULA
  8 + * is strictly prohibited.
  9 + *
  10 + */
  11 +
  12 +/* Template project which demonstrates the basics on how to setup a project
  13 +* example application.
  14 +* Host code.
  15 +*/
  16 +
  17 +// includes, system
  18 +#include <stdlib.h>
  19 +#include <stdio.h>
  20 +#include <string.h>
  21 +#include <math.h>
  22 +#include <iostream>
  23 +
  24 +// includes CUDA
  25 +#include <cuda.h>
  26 +#include <curand_kernel.h>
  27 +#include <cuda_runtime.h>
  28 +
  29 +// includes, project
  30 +#include <sdkHelper.h> // helper for shared that are common to CUDA SDK samples
  31 +#include <shrQATest.h> // This is for automated testing output (--qatest)
  32 +#include <shrUtils.h>
  33 +
  34 +// includes, kernels
  35 +#include <gpu_kernel.cu>
  36 +
  37 +////////////////////////////////////////////////////////////////////////////////
  38 +// declaration, forward
  39 +void runTest( int argc, char** argv);
  40 +
  41 +extern "C"
  42 +double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
  43 +extern "C"
  44 +double computeGoldFloat( const int n, const float awgn_sigma, const float phase_sigma);
  45 +
  46 +
  47 +int num_blocks = 32;
  48 +int num_threads = 256;
  49 +unsigned int n = 80000;
  50 +int kernelIter = 1;
  51 +////////////////////////////////////////////////////////////////////////////////
  52 +// These are CUDA Helper functions
  53 +
  54 +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
  55 +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
  56 +
  57 +inline void __checkCudaErrors(cudaError err, const char *file, const int line )
  58 +{
  59 + if(cudaSuccess != err)
  60 + {
  61 + fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
  62 + exit(-1);
  63 + }
  64 +}
  65 +
  66 +// This will output the proper error string when calling cudaGetLastError
  67 +#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
  68 +
  69 +inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )
  70 +{
  71 + cudaError_t err = cudaGetLastError();
  72 + if (cudaSuccess != err)
  73 + {
  74 + fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
  75 + file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
  76 + exit(-1);
  77 + }
  78 +}
  79 +
  80 +void gpuCompute(int chooseKernel)
  81 +{
  82 + unsigned int i;
  83 + unsigned long long total = 0;
  84 + curandState *devStates;
  85 + unsigned int * devResults , *hostResults, *devBitTable;
  86 + double awgn_sigma = 1;
  87 + double phase_sigma = 0.16;
  88 +
  89 + StopWatchInterface *timerKernel = 0;
  90 + StopWatchInterface *timerMemory = 0;
  91 + StopWatchInterface *timerAdd = 0;
  92 + StopWatchInterface *timerTotal = 0;
  93 + StopWatchInterface *timerSetup = 0;
  94 + sdkCreateTimer( &timerKernel );
  95 + sdkCreateTimer( &timerMemory);
  96 + sdkCreateTimer( &timerAdd );
  97 + sdkCreateTimer( &timerTotal );
  98 + sdkCreateTimer( &timerSetup );
  99 +
  100 + cudaDeviceReset(); //need to get constant times between runs
  101 + sdkStartTimer( &timerTotal );
  102 +
  103 + sdkStartTimer( &timerSetup );
  104 + /* Allocate space for results on device */
  105 + checkCudaErrors( cudaMalloc (( void **)&devResults , num_blocks * num_threads * sizeof (unsigned int)));
  106 +
  107 + /* Allocate space for prng states on device */
  108 + checkCudaErrors ( cudaMalloc (( void **)&devStates , num_blocks * num_threads * sizeof ( curandState ))); //48 bytes
  109 + /* Allocate space for lookup table on device */
  110 + checkCudaErrors ( cudaMalloc (( void **)&devBitTable , 16*sizeof(unsigned int)));
  111 +
  112 + /* Setup prng states */
  113 + setup_kernel<<<num_blocks, num_threads>>>( devStates, devBitTable );
  114 + /* Allocate space for results on host */
  115 + hostResults = (unsigned int *) calloc (num_blocks * num_threads, sizeof (unsigned int));
  116 +
  117 +
  118 + //sync for correct timing
  119 + checkCudaErrors( cudaDeviceSynchronize() );
  120 +
  121 + sdkStopTimer( &timerSetup );
  122 +
  123 + sdkStartTimer( &timerKernel );
  124 + for (int j=0; j<kernelIter; j++) {
  125 + /* Copy device memory to host */
  126 + if(chooseKernel) {
  127 + kernelDouble<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
  128 + }
  129 + else {
  130 + kernelFloat<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
  131 + }
  132 + checkCudaErrors( cudaDeviceSynchronize() );
  133 +
  134 + }
  135 + sdkStopTimer( &timerKernel);
  136 +
  137 + sdkStartTimer( &timerMemory );
  138 + checkCudaErrors ( cudaMemcpy ( hostResults , devResults , num_blocks*num_threads*sizeof(unsigned int), cudaMemcpyDeviceToHost ));
  139 + checkCudaErrors( cudaDeviceSynchronize() );
  140 + /* Show result */
  141 + sdkStopTimer( &timerMemory);
  142 + sdkStartTimer( &timerAdd );
  143 + for(i = 0; i < num_blocks * num_threads; i++) {
  144 + total += hostResults [i];
  145 + }
  146 + double result = ( double ) total / (num_blocks * num_threads * n * kernelIter );
  147 +
  148 + sdkStopTimer( &timerAdd);
  149 + sdkStopTimer( &timerTotal);
  150 +
  151 + printf("----------------\n");
  152 + if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
  153 + printf("----------------\n");
  154 + printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
  155 + printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
  156 + printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
  157 + printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
  158 + printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
  159 + printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
  160 +
  161 +
  162 + /* Cleanup */
  163 + checkCudaErrors ( cudaFree ( devStates ));
  164 + checkCudaErrors ( cudaFree ( devResults ));
  165 + checkCudaErrors ( cudaFree ( devBitTable ));
  166 + free ( hostResults );
  167 +
  168 + sdkDeleteTimer( &timerKernel );
  169 + sdkDeleteTimer( &timerMemory);
  170 + sdkDeleteTimer( &timerAdd );
  171 + sdkDeleteTimer( &timerTotal );
  172 + sdkDeleteTimer( &timerSetup );
  173 +}
  174 +
  175 +void seqCompute(int argc, char** argv) {
  176 + StopWatchInterface *timer = 0;
  177 + StopWatchInterface *timer2 = 0;
  178 +
  179 + sdkCreateTimer( &timer );
  180 + sdkStartTimer( &timer );
  181 + double resultDouble = computeGoldDouble(n*num_threads*num_blocks,1,0.16);
  182 + sdkStopTimer( &timer );
  183 +
  184 + sdkCreateTimer( &timer2 );
  185 + sdkStartTimer( &timer2 );
  186 + double resultFloat = computeGoldFloat(n*num_threads*num_blocks,1,0.16);
  187 + sdkStopTimer( &timer2 );
  188 +
  189 +
  190 + printf("----------------\n");
  191 + printf("CPU versions:\n");
  192 + printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", sdkGetTimerValue( &timer ),sdkGetTimerValue( &timer2 ) );
  193 + printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",resultDouble,resultFloat);
  194 +
  195 + sdkDeleteTimer( &timer );
  196 + sdkDeleteTimer( &timer2 );
  197 +
  198 +}
  199 +
  200 +////////////////////////////////////////////////////////////////////////////////
  201 +// Program main
  202 +////////////////////////////////////////////////////////////////////////////////
  203 +int
  204 +main( int argc, char** argv)
  205 +{
  206 + printf("Config: Blocks:%i Threads/Block:%i, n:%i, Actual N:%lu\n",num_blocks,num_threads,n,(unsigned long long) num_blocks*num_threads*n*kernelIter);
  207 + seqCompute(argc, argv);
  208 + gpuCompute(0);
  209 + gpuCompute(1);
  210 +
  211 +}
459 ber_gpu/template_time.cu
... ... @@ -1,459 +0,0 @@
1   -/*
2   - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3   - *
4   - * Please refer to the NVIDIA end user license agreement (EULA) associated
5   - * with this source code for terms and conditions that govern your use of
6   - * this software. Any use, reproduction, disclosure, or distribution of
7   - * this software and related documentation outside the terms of the EULA
8   - * is strictly prohibited.
9   - *
10   - */
11   -
12   -/* Template project which demonstrates the basics on how to setup a project
13   -* example application.
14   -* Host code.
15   -*/
16   -
17   -// includes, system
18   -#include <stdlib.h>
19   -#include <stdio.h>
20   -#include <string.h>
21   -#include <math.h>
22   -#include <iostream>
23   -
24   -// includes CUDA
25   -#include <cuda.h>
26   -#include <curand_kernel.h>
27   -#include <cuda_runtime.h>
28   -
29   -// includes, project
30   -#include <sdkHelper.h> // helper for shared that are common to CUDA SDK samples
31   -#include <shrQATest.h> // This is for automated testing output (--qatest)
32   -#include <shrUtils.h>
33   -
34   -// includes, kernels
35   -#include <template_kernel.cu>
36   -
37   -////////////////////////////////////////////////////////////////////////////////
38   -// declaration, forward
39   -void runTest( int argc, char** argv);
40   -
41   -extern "C"
42   -double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
43   -extern "C"
44   -double computeGoldFloat( const int n, const float awgn_sigma, const float phase_sigma);
45   -
46   -
47   -int num_blocks = 16;
48   -int num_threads = 384;
49   -unsigned int n = 53332*2;
50   -int kernelIter = 1;
51   -////////////////////////////////////////////////////////////////////////////////
52   -// These are CUDA Helper functions
53   -
54   -// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
55   -#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
56   -
57   -inline void __checkCudaErrors(cudaError err, const char *file, const int line )
58   -{
59   - if(cudaSuccess != err)
60   - {
61   - fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
62   - exit(-1);
63   - }
64   -}
65   -
66   -// This will output the proper error string when calling cudaGetLastError
67   -#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
68   -
69   -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )
70   -{
71   - cudaError_t err = cudaGetLastError();
72   - if (cudaSuccess != err)
73   - {
74   - fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
75   - file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
76   - exit(-1);
77   - }
78   -}
79   -
80   -// General GPU Device CUDA Initialization
81   -int gpuDeviceInit(int devID)
82   -{
83   - int deviceCount;
84   - checkCudaErrors(cudaGetDeviceCount(&deviceCount));
85   -
86   - if (deviceCount == 0)
87   - {
88   - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
89   - exit(-1);
90   - }
91   -
92   - if (devID < 0)
93   - devID = 0;
94   -
95   - if (devID > deviceCount-1)
96   - {
97   - fprintf(stderr, "\n");
98   - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
99   - fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
100   - fprintf(stderr, "\n");
101   - return -devID;
102   - }
103   -
104   - cudaDeviceProp deviceProp;
105   - checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
106   -
107   - if (deviceProp.major < 1)
108   - {
109   - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
110   - exit(-1);
111   - }
112   -
113   - checkCudaErrors( cudaSetDevice(devID) );
114   - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
115   -
116   - return devID;
117   -}
118   -
119   -// This function returns the best GPU (with maximum GFLOPS)
120   -int gpuGetMaxGflopsDeviceId()
121   -{
122   - int current_device = 0, sm_per_multiproc = 0;
123   - int max_compute_perf = 0, max_perf_device = 0;
124   - int device_count = 0, best_SM_arch = 0;
125   - cudaDeviceProp deviceProp;
126   - cudaGetDeviceCount( &device_count );
127   -
128   - // Find the best major SM Architecture GPU device
129   - while (current_device < device_count)
130   - {
131   - cudaGetDeviceProperties( &deviceProp, current_device );
132   - if (deviceProp.major > 0 && deviceProp.major < 9999)
133   - {
134   - best_SM_arch = MAX(best_SM_arch, deviceProp.major);
135   - }
136   - current_device++;
137   - }
138   -
139   - // Find the best CUDA capable GPU device
140   - current_device = 0;
141   - while( current_device < device_count )
142   - {
143   - cudaGetDeviceProperties( &deviceProp, current_device );
144   - if (deviceProp.major == 9999 && deviceProp.minor == 9999)
145   - {
146   - sm_per_multiproc = 1;
147   - }
148   - else
149   - {
150   - sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
151   - }
152   -
153   - int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
154   -
155   - if( compute_perf > max_compute_perf )
156   - {
157   - // If we find GPU with SM major > 2, search only these
158   - if ( best_SM_arch > 2 )
159   - {
160   - // If our device==dest_SM_arch, choose this, or else pass
161   - if (deviceProp.major == best_SM_arch)
162   - {
163   - max_compute_perf = compute_perf;
164   - max_perf_device = current_device;
165   - }
166   - }
167   - else
168   - {
169   - max_compute_perf = compute_perf;
170   - max_perf_device = current_device;
171   - }
172   - }
173   - ++current_device;
174   - }
175   - return max_perf_device;
176   -}
177   -
178   -
179   -// Initialization code to find the best CUDA Device
180   -int findCudaDevice(int argc, const char **argv)
181   -{
182   - cudaDeviceProp deviceProp;
183   - int devID = 0;
184   - // If the command-line has a device number specified, use it
185   - if (checkCmdLineFlag(argc, argv, "device"))
186   - {
187   - devID = getCmdLineArgumentInt(argc, argv, "device=");
188   - if (devID < 0)
189   - {
190   - printf("Invalid command line parameter\n ");
191   - exit(-1);
192   - }
193   - else
194   - {
195   - devID = gpuDeviceInit(devID);
196   - if (devID < 0)
197   - {
198   - printf("exiting...\n");
199   - shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
200   - exit(-1);
201   - }
202   - }
203   - }
204   - else
205   - {
206   - // Otherwise pick the device with highest Gflops/s
207   - devID = gpuGetMaxGflopsDeviceId();
208   - checkCudaErrors( cudaSetDevice( devID ) );
209   - checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
210   - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
211   - }
212   - return devID;
213   -}
214   -// end of CUDA Helper Functions
215   -
216   -
217   -
218   -////////////////////////////////////////////////////////////////////////////////
219   -//! Run a simple test for CUDA
220   -////////////////////////////////////////////////////////////////////////////////
221   -void
222   -runTest( int argc, char** argv)
223   -{
224   - bool bTestResult = true;
225   -
226   - shrQAStart(argc, argv);
227   -
228   - // use command-line specified CUDA device, otherwise use device with highest Gflops/s
229   - int devID = findCudaDevice(argc, (const char**)argv);
230   -
231   - StopWatchInterface *timer = 0;
232   - sdkCreateTimer( &timer );
233   - sdkStartTimer( &timer );
234   -
235   - unsigned int num_threads = 32;
236   - unsigned int mem_size = sizeof( float) * num_threads;
237   -
238   - // allocate host memory
239   - float* h_idata = (float*) malloc( mem_size);
240   - // initalize the memory
241   - for( unsigned int i = 0; i < num_threads; ++i)
242   - {
243   - h_idata[i] = (float) i;
244   - }
245   -
246   - // allocate device memory
247   - float* d_idata;
248   - checkCudaErrors( cudaMalloc( (void**) &d_idata, mem_size) );
249   - // copy host memory to device
250   - checkCudaErrors( cudaMemcpy( d_idata, h_idata, mem_size,
251   - cudaMemcpyHostToDevice) );
252   -
253   - // allocate device memory for result
254   - float* d_odata;
255   - checkCudaErrors( cudaMalloc( (void**) &d_odata, mem_size));
256   -
257   - // setup execution parameters
258   - dim3 grid( 1, 1, 1);
259   - dim3 threads( num_threads, 1, 1);
260   -
261   - // execute the kernel
262   -// testKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);
263   -
264   - // check if kernel execution generated and error
265   - getLastCudaError("Kernel execution failed");
266   -
267   - // allocate mem for the result on host side
268   - float* h_odata = (float*) malloc( mem_size);
269   - // copy result from device to host
270   - checkCudaErrors( cudaMemcpy( h_odata, d_odata, sizeof( float) * num_threads,
271   - cudaMemcpyDeviceToHost) );
272   -
273   - sdkStopTimer( &timer );
274   - printf( "Processing time: %f (ms)\n", sdkGetTimerValue( &timer ) );
275   - sdkDeleteTimer( &timer );
276   -
277   - // compute reference solution
278   - float* reference = (float*) malloc( mem_size);
279   - //computeGold( reference, h_idata, num_threads);
280   -
281   - // check result
282   - if( checkCmdLineFlag( argc, (const char**) argv, "regression") )
283   - {
284   - // write file for regression test
285   - sdkWriteFile( "./data/regression.dat", h_odata, num_threads, 0.0f, false );
286   - }
287   - else
288   - {
289   - // custom output handling when no regression test running
290   - // in this case check if the result is equivalent to the expected soluion
291   - bTestResult = compareData( reference, h_odata, num_threads, 0.0f, 0.0f );
292   - }
293   - // cleanup memory
294   - free( h_idata );
295   - free( h_odata );
296   - free( reference );
297   - checkCudaErrors(cudaFree(d_idata));
298   - checkCudaErrors(cudaFree(d_odata));
299   -
300   - cudaDeviceReset();
301   - shrQAFinishExit(argc, (const char **)argv, (bTestResult ? QA_PASSED : QA_FAILED) );
302   -}
303   -
304   -void gpuCompute(int chooseKernel)
305   -{
306   -
307   -
308   -
309   - unsigned int i;
310   - unsigned long long total = 0;
311   - curandState *devStates;
312   - unsigned int * devResults , *hostResults, *devBitTable;
313   - double awgn_sigma = 1;
314   - //double awgn_sigma = pow(2.0,-7);
315   - double phase_sigma = 0.16;
316   - //double phase_sigma = 0.04;
317   -
318   - StopWatchInterface *timerKernel = 0;
319   - StopWatchInterface *timerMemory = 0;
320   - StopWatchInterface *timerAdd = 0;
321   - StopWatchInterface *timerTotal = 0;
322   - StopWatchInterface *timerSetup = 0;
323   - sdkCreateTimer( &timerKernel );
324   - sdkCreateTimer( &timerMemory);
325   - sdkCreateTimer( &timerAdd );
326   - sdkCreateTimer( &timerTotal );
327   - sdkCreateTimer( &timerSetup );
328   -
329   - cudaDeviceReset(); //need to get constant times between runs
330   - sdkStartTimer( &timerTotal );
331   -
332   - sdkStartTimer( &timerSetup );
333   - /* Allocate space for results on device */
334   - checkCudaErrors( cudaMalloc (( void **)&devResults , num_blocks * num_threads * sizeof (unsigned int)));
335   -
336   - /* Allocate space for prng states on device */
337   - checkCudaErrors ( cudaMalloc (( void **)&devStates , num_blocks * num_threads * sizeof ( curandState ))); //48 bytes
338   - /* Allocate space for lookup table on device */
339   - checkCudaErrors ( cudaMalloc (( void **)&devBitTable , 16*sizeof(unsigned int)));
340   -
341   - /* Setup prng states */
342   - setup_kernel<<<num_blocks, num_threads>>>( devStates, devBitTable );
343   - /* Allocate space for results on host */
344   - hostResults = (unsigned int *) calloc (num_blocks * num_threads, sizeof (unsigned int));
345   -
346   -
347   - //initialise device result memory to 0
348   - checkCudaErrors( cudaMemset ( devResults , 0, num_blocks * num_threads * sizeof (unsigned int)));
349   - //sync for correct timing
350   - checkCudaErrors( cudaDeviceSynchronize() );
351   -
352   - sdkStopTimer( &timerSetup );
353   -
354   - sdkStartTimer( &timerKernel );
355   - for (int j=0; j<kernelIter; j++) {
356   - /* Copy device memory to host */
357   - if(chooseKernel) {
358   - kernelDouble<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
359   - }
360   - else {
361   - kernelFloat<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
362   - }
363   - checkCudaErrors( cudaDeviceSynchronize() );
364   -
365   - }
366   - sdkStopTimer( &timerKernel);
367   -
368   - sdkStartTimer( &timerMemory );
369   - checkCudaErrors ( cudaMemcpy ( hostResults , devResults , num_blocks*num_threads*sizeof(unsigned int), cudaMemcpyDeviceToHost ));
370   - checkCudaErrors( cudaDeviceSynchronize() );
371   - /* Show result */
372   - sdkStopTimer( &timerMemory);
373   - sdkStartTimer( &timerAdd );
374   - for(i = 0; i < num_blocks * num_threads; i++) {
375   - total += hostResults [i];
376   - }
377   - double result = ( double ) total / (num_blocks * num_threads * n * kernelIter );
378   -
379   - sdkStopTimer( &timerAdd);
380   - sdkStopTimer( &timerTotal);
381   -
382   - printf("----------------\n");
383   - if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
384   - printf("----------------\n");
385   - printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
386   - printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
387   - printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
388   - printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
389   - printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
390   - printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
391   -
392   -
393   - /* Cleanup */
394   - checkCudaErrors ( cudaFree ( devStates ));
395   - checkCudaErrors ( cudaFree ( devResults ));
396   - checkCudaErrors ( cudaFree ( devBitTable ));
397   - free ( hostResults );
398   -
399   - sdkDeleteTimer( &timerKernel );
400   - sdkDeleteTimer( &timerMemory);
401   - sdkDeleteTimer( &timerAdd );
402   - sdkDeleteTimer( &timerTotal );
403   - sdkDeleteTimer( &timerSetup );
404   -}
405   -
406   -void seqCompute(int argc, char** argv) {
407   - StopWatchInterface *timer = 0;
408   - StopWatchInterface *timer2 = 0;
409   - /*
410   - sdkCreateTimer( &timer );
411   - sdkStartTimer( &timer );
412   -
413   - double[81] phase_sigma;
414   - for (int k=0; k<81; k++) {
415   - phase_sigma[k] =
416   - }
417   - for j=1:length(phase_sigma)
418   - for i=1:length(awgn_sigma)
419   - error_rate(i,j)=ber_test(n,awgn_sigma(i),phase_sigma(j));
420   - end
421   - */
422   -
423   - //awgn_sigma=2^(-8:0.1:0);
424   - //phase_sigma=[0.01,0.04,0.08,0.16];
425   - sdkCreateTimer( &timer );
426   - sdkStartTimer( &timer );
427   - double resultDouble = computeGoldDouble(n*num_threads*num_blocks,1,0.16);
428   - sdkStopTimer( &timer );
429   -
430   - sdkCreateTimer( &timer2 );
431   - sdkStartTimer( &timer2 );
432   - double resultFloat = computeGoldFloat(n*num_threads*num_blocks,1,0.16);
433   - sdkStopTimer( &timer2 );
434   -
435   -
436   - printf("----------------\n");
437   - printf("CPU versions:\n");
438   - printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", sdkGetTimerValue( &timer ),sdkGetTimerValue( &timer2 ) );
439   - printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",resultDouble,resultFloat);
440   -
441   - sdkDeleteTimer( &timer );
442   - sdkDeleteTimer( &timer2 );
443   -
444   -}
445   -
446   -////////////////////////////////////////////////////////////////////////////////
447   -// Program main
448   -////////////////////////////////////////////////////////////////////////////////
449   -int
450   -main( int argc, char** argv)
451   -{
452   - shrQAStart(argc, argv);
453   - printf("GPU configuration: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i \n",num_blocks,num_threads,n,kernelIter);
454   - //runTest( argc, argv);
455   - //seqCompute(argc, argv);
456   - gpuCompute(0);
457   - gpuCompute(1);
458   - //std::cin(0);
459   -}
14 ber_gpu/template_vs2010.vcxproj
@@ -197,22 +197,16 @@
197 197 </CudaCompile>
198 198 </ItemDefinitionGroup>
199 199 <ItemGroup>
200   - <CudaCompile Include="template.cu" />
201   - <CudaCompile Include="template_time.cu">
202   - <Include Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">./;../../common/inc;../../../shared/inc</Include>
203   - <Include Condition="'$(Configuration)|$(Platform)'=='Release|x64'">./;../../common/inc;../../../shared/inc</Include>
204   - <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10;compute_20,sm_20</CodeGeneration>
205   - <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10;compute_20,sm_20</CodeGeneration>
206   - <TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">64</TargetMachinePlatform>
207   - <TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Release|x64'">64</TargetMachinePlatform>
  200 + <CudaCompile Include="main_results.cu">
208 201 <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
209 202 </CudaCompile>
210   - <CudaCompile Include="template_kernel.cu">
  203 + <CudaCompile Include="gpu_kernel.cu">
211 204 <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
212 205 </CudaCompile>
  206 + <CudaCompile Include="main_timing.cu" />
213 207 </ItemGroup>
214 208 <ItemGroup>
215   - <ClCompile Include="template_gold.cpp" />
  209 + <ClCompile Include="C_gold.cpp" />
216 210 </ItemGroup>
217 211 <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
218 212 <ImportGroup Label="ExtensionTargets">
BIN  ber_gpu/vc100.pdb
Binary file not shown
9 ber_matlab/script.m
... ... @@ -1,11 +1,14 @@
  1 +tic;
  2 +ber_test(256*32*8000,1,0.16);
  3 +toc
  4 +
1 5 %compare_plots(c100s,c100d)
2 6 close all
3 7 %g100000d = importfile('g100000d.csv');
4 8 %g10000000s = importfile('g10000000s.csv');
5 9 %g1000000s = importfile('g10000000s.csv');
6   -[output,same,absError,relError,maxAbsError,maxRelError] = compareMatrices(g1000d,m10000db,3e-4);
  10 +%[output,same,absError,relError,maxAbsError,maxRelError] = compareMatrices(g1000d,m10000db,3e-4);
7 11
8 12 format compact
9 13 format short g
10   -maxRelError
11   -maxAbsError
  14 +
BIN  version1.6.jpg

0 comments on commit e718137

Please sign in to comment.
Something went wrong with that request. Please try again.