Skip to content
Browse files

Version 2

  • Loading branch information...
1 parent 267fa52 commit e718137e00cf1f7deafd31dfe7176451026e0038 @elatier committed Mar 27, 2012
View
0 ber_gpu/template_gold.cpp → ber_gpu/C_gold.cpp
File renamed without changes.
View
56 ber_gpu/template_kernel.cu → ber_gpu/gpu_kernel.cu
@@ -9,33 +9,38 @@
*
*/
-/* Template project which demonstrates the basics on how to setup a project
- * example application.
- * Device code.
- */
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <cuComplex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
#include <math_functions.h>
+#include <curand_kernel.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "device_functions.h"
-////////////////////////////////////////////////////////////////////////////////
-//! Simple test kernel for device functionality
-//! @param g_idata input data in global memory
-//! @param g_odata output data in global memory
-////////////////////////////////////////////////////////////////////////////////
-
-__device__ __forceinline__ int countBitsDev(int i)
+/**
+ * Counts the bits set to 1 in a 32-bit word.
+ *
+ * @param v word to inspect
+ * @return number of set bits in v (0..32)
+ */
+__device__ __forceinline__ unsigned int countBitsDev(unsigned int v)
{
- i = i - ((i >> 1) & 0x55555555);
- i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
- return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
+    // Use the population-count intrinsic instead of a clear-lowest-bit loop:
+    // the loop's trip count depends on the data, so threads of one warp
+    // would iterate a different number of times.
+    return (unsigned int)__popc(v);
}
+/**
+ * Quantises a single-precision sample to an integer level.
+ *
+ * The level is floor(x) + 2, clamped to the range [0, 3].
+ *
+ * @param x sample to quantise
+ * @return level index between 0 and 3 inclusive
+ */
+__device__ __forceinline__ int quantisizeDev(float x)
+{
+    const int level = (int)floor(x) + 2;
+    // Clamp from below first, then from above, as a single expression.
+    return min(3, max(0, level));
+}
-template<class type>
-__device__ __forceinline__ int quantisizeDev(type x)
+__device__ __forceinline__ int quantisizeDev(double x)
{
int res=(int)floor(x)+2;
res=max(0,res);
@@ -61,12 +66,12 @@ kernelDouble(curandState * state, const unsigned int iter, const double awgn_sig
int id = threadIdx.x + blockIdx.x * blockDim.x;
/* Copy state to local memory for efficiency */
curandState ls = state[id];
+ unsigned int local_distance=0;
cuDoubleComplex signal, noise;
int input, output;
- distance[id]=0;
- for (int i=0; i<iter;i++) {
+ for (unsigned int i=0; i<iter;i++) {
input = curand(&ls) & 15; //same as %16
- signal = make_cuDoubleComplex((double)(input & 3)-1.5,(double)(input >> 2)-1.5);
+ signal = make_cuDoubleComplex((input & 3)-1.5,(input >> 2)-1.5);
sincos(curand_normal_double(&ls)*phase_sigma, &noise.y, &noise.x);
signal = cuCmul(signal,noise);
@@ -77,9 +82,10 @@ kernelDouble(curandState * state, const unsigned int iter, const double awgn_sig
signal = cuCadd(signal,noise);
output = quantisizeDev(signal.x) + quantisizeDev(signal.y)*4;
- distance[id] += bitTable[output ^ input];
+ local_distance += bitTable[output ^ input];
}
state[id] = ls;
+ distance[id] = local_distance;
}
__global__ void
@@ -88,12 +94,12 @@ kernelFloat(curandState * state, const unsigned int iter, const float awgn_sigma
int id = threadIdx.x + blockIdx.x * blockDim.x;
/* Copy state to local memory for efficiency */
curandState ls = state[id];
+ unsigned int local_distance=0;
cuComplex signal, noise;
int input, output;
- distance[id]=0;
- for (int i=0; i<iter;i++) {
+ for (unsigned int i=0; i<iter;i++) {
input = curand(&ls) & 15; //same as %16
- signal = make_cuComplex((float)(input & 3)-1.5,(float)(input >> 2)-1.5);
+ signal = make_cuComplex((input & 3)-1.5,(input >> 2)-1.5);
sincosf(curand_normal(&ls)*phase_sigma, &noise.y, &noise.x);
signal = cuCmulf(signal,noise);
@@ -104,9 +110,11 @@ kernelFloat(curandState * state, const unsigned int iter, const float awgn_sigma
signal = cuCaddf(signal,noise);
output = quantisizeDev(signal.x) + quantisizeDev(signal.y)*4;
- distance[id] += bitTable[output ^ input];
+ local_distance += bitTable[output ^ input];
}
+
state[id] = ls;
+ distance[id] = local_distance;
}
#endif // #ifndef _TEMPLATE_KERNEL_H_
View
45 ber_gpu/template.cu → ber_gpu/main_results.cu
@@ -32,7 +32,7 @@
#include <shrUtils.h>
// includes, kernels
-#include <template_kernel.cu>
+#include <gpu_kernel.cu>
extern "C"
double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
@@ -197,17 +197,6 @@ void gpuCompute(int argc, char** argv)
fclose(file);
sdkStopTimer( &timerTotal);
- printf("----------------\n");
- if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
- printf("----------------\n");
- printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
- printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
- printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
- printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
- printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
- //printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
-
-
/* Cleanup */
checkCudaErrors ( cudaFree ( devStates ));
checkCudaErrors ( cudaFree ( devResults ));
@@ -221,30 +210,6 @@ void gpuCompute(int argc, char** argv)
sdkDeleteTimer( &timerSetup );
}
-void seqCompute(int argc, char** argv) {
- StopWatchInterface *timer = 0;
- StopWatchInterface *timer2 = 0;
-
- sdkCreateTimer( &timer );
- sdkStartTimer( &timer );
- double resultDouble = computeGoldDouble(n*num_threads*num_blocks,(double)1,(double)0.16);
- sdkStopTimer( &timer );
-
- sdkCreateTimer( &timer2 );
- sdkStartTimer( &timer2 );
- double resultFloat = computeGoldFloat(n*num_threads*num_blocks,(float)1,(float)0.16);
- sdkStopTimer( &timer2 );
-
-
- printf("----------------\n");
- printf("CPU versions:\n");
- printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", sdkGetTimerValue( &timer ),sdkGetTimerValue( &timer2 ) );
- printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",resultDouble,resultFloat);
-
- sdkDeleteTimer( &timer );
- sdkDeleteTimer( &timer2 );
-
-}
////////////////////////////////////////////////////////////////////////////////
// Program main
@@ -255,13 +220,11 @@ main( int argc, char** argv)
num_blocks = 16;
num_threads = 256;
kernelIter = 1;
-
- //runTest( argc, argv);
- //seqCompute(argc, argv);
+
if (argc >= 3) {
n = atoi(argv[2]);
- printf("Config: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i Total:%lu\n",num_blocks,num_threads,n,kernelIter,(unsigned long long) num_blocks*num_threads*n*kernelIter);
+ printf("Config: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i Total N:%lu\n",num_blocks,num_threads,n,kernelIter,(unsigned long long) num_blocks*num_threads*n*kernelIter);
gpuCompute(argc, argv);
}
- //std::cin(0);
+
}
View
211 ber_gpu/main_timing.cu
@@ -0,0 +1,211 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Template project which demonstrates the basics on how to setup a project
+* example application.
+* Host code.
+*/
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <iostream>
+
+// includes CUDA
+#include <cuda.h>
+#include <curand_kernel.h>
+#include <cuda_runtime.h>
+
+// includes, project
+#include <sdkHelper.h> // helper for shared that are common to CUDA SDK samples
+#include <shrQATest.h> // This is for automated testing output (--qatest)
+#include <shrUtils.h>
+
+// includes, kernels
+#include <gpu_kernel.cu>
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+extern "C"
+double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
+extern "C"
+double computeGoldFloat( const int n, const float awgn_sigma, const float phase_sigma);
+
+
+int num_blocks = 32;
+int num_threads = 256;
+unsigned int n = 80000;
+int kernelIter = 1;
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
+
+/**
+ * Aborts the program if a CUDA runtime call reported an error.
+ *
+ * @param err  status returned by the CUDA runtime call
+ * @param file source file of the call site (reported in the message)
+ * @param line source line of the call site (reported in the message)
+ *
+ * On failure the error code and its description are written to stderr and
+ * the process exits with status -1; on success the function just returns.
+ */
+inline void __checkCudaErrors(cudaError err, const char *file, const int line )
+{
+    if (err == cudaSuccess)
+    {
+        return; // nothing to report
+    }
+
+    const char *description = cudaGetErrorString( err );
+    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, description );
+    exit(-1);
+}
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
+
+/**
+ * Aborts the program if cudaGetLastError() reports a pending error.
+ *
+ * @param errorMessage caller-supplied context included in the report
+ * @param file         source file of the call site
+ * @param line         source line of the call site
+ *
+ * On failure the context, error code and description are written to stderr
+ * and the process exits with status -1.
+ */
+inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        return; // no pending error
+    }
+
+    fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+            file, line, errorMessage, (int)status, cudaGetErrorString( status ) );
+    exit(-1);
+}
+
+/**
+ * Runs the bit-error simulation on the GPU, then prints per-phase timings
+ * and the measured fraction of erroneous bits.
+ *
+ * Phases timed separately: setup (device/host allocations + setup_kernel),
+ * kernel (kernelIter launches of the selected kernel), result copy to host,
+ * and the host-side summation of the per-thread error counts.
+ *
+ * @param chooseKernel non-zero runs kernelDouble, zero runs kernelFloat.
+ *
+ * Any CUDA failure or host allocation failure terminates the process.
+ */
+void gpuCompute(int chooseKernel)
+{
+    /* One result slot and one PRNG state per launched thread. */
+    const size_t threadCount = (size_t)num_blocks * (size_t)num_threads;
+    const size_t resultBytes = threadCount * sizeof (unsigned int);
+
+    unsigned long long total = 0;
+    curandState *devStates = NULL;
+    unsigned int *devResults = NULL, *hostResults = NULL, *devBitTable = NULL;
+    double awgn_sigma = 1;
+    double phase_sigma = 0.16;
+
+    StopWatchInterface *timerKernel = 0;
+    StopWatchInterface *timerMemory = 0;
+    StopWatchInterface *timerAdd = 0;
+    StopWatchInterface *timerTotal = 0;
+    StopWatchInterface *timerSetup = 0;
+    sdkCreateTimer( &timerKernel );
+    sdkCreateTimer( &timerMemory);
+    sdkCreateTimer( &timerAdd );
+    sdkCreateTimer( &timerTotal );
+    sdkCreateTimer( &timerSetup );
+
+    checkCudaErrors( cudaDeviceReset() ); //need to get constant times between runs
+    sdkStartTimer( &timerTotal );
+
+    sdkStartTimer( &timerSetup );
+    /* Allocate space for results on device */
+    checkCudaErrors( cudaMalloc (( void **)&devResults , resultBytes ));
+
+    /* Allocate space for prng states on device */
+    checkCudaErrors ( cudaMalloc (( void **)&devStates , threadCount * sizeof ( curandState ))); //48 bytes
+    /* Allocate space for lookup table on device */
+    checkCudaErrors ( cudaMalloc (( void **)&devBitTable , 16*sizeof(unsigned int)));
+
+    /* Setup prng states */
+    setup_kernel<<<num_blocks, num_threads>>>( devStates, devBitTable );
+    /* A bad launch configuration is only reported through cudaGetLastError */
+    getLastCudaError("setup_kernel launch failed");
+
+    /* Allocate space for results on host */
+    hostResults = (unsigned int *) calloc (threadCount, sizeof (unsigned int));
+    if (hostResults == NULL)
+    {
+        fprintf(stderr, "%s(%i) : failed to allocate host result buffer.\n", __FILE__, __LINE__);
+        exit(-1);
+    }
+
+    //sync for correct timing
+    checkCudaErrors( cudaDeviceSynchronize() );
+
+    sdkStopTimer( &timerSetup );
+
+    sdkStartTimer( &timerKernel );
+    /* NOTE(review): the kernels shown in this change store their count into
+     * devResults rather than adding to it, so with kernelIter > 1 only the
+     * last launch appears to be counted while the denominator below scales
+     * by kernelIter - confirm before raising kernelIter above 1. */
+    for (int j=0; j<kernelIter; j++) {
+        /* Launch the selected simulation kernel with n/4 iterations per thread */
+        if(chooseKernel) {
+            kernelDouble<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
+        }
+        else {
+            kernelFloat<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
+        }
+        getLastCudaError("simulation kernel launch failed");
+        /* Wait for completion so the kernel timer covers the execution */
+        checkCudaErrors( cudaDeviceSynchronize() );
+
+    }
+    sdkStopTimer( &timerKernel);
+
+    sdkStartTimer( &timerMemory );
+    /* Copy the per-thread error counts from device to host */
+    checkCudaErrors ( cudaMemcpy ( hostResults , devResults , resultBytes, cudaMemcpyDeviceToHost ));
+    checkCudaErrors( cudaDeviceSynchronize() );
+    sdkStopTimer( &timerMemory);
+
+    sdkStartTimer( &timerAdd );
+    /* Sum the per-thread counts in 64 bits */
+    for(size_t i = 0; i < threadCount; i++) {
+        total += hostResults [i];
+    }
+    /* Form the denominator in double precision: the product of the four
+     * factors can exceed 32 bits for larger configurations. */
+    const double simulated = (double) threadCount * (double) n * (double) kernelIter;
+    double result = ( double ) total / simulated;
+
+    sdkStopTimer( &timerAdd);
+    sdkStopTimer( &timerTotal);
+
+    /* Show result */
+    printf("----------------\n");
+    if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
+    printf("----------------\n");
+    printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
+    printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
+    printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
+    printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
+    printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
+    printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
+
+
+    /* Cleanup */
+    checkCudaErrors ( cudaFree ( devStates ));
+    checkCudaErrors ( cudaFree ( devResults ));
+    checkCudaErrors ( cudaFree ( devBitTable ));
+    free ( hostResults );
+
+    sdkDeleteTimer( &timerKernel );
+    sdkDeleteTimer( &timerMemory);
+    sdkDeleteTimer( &timerAdd );
+    sdkDeleteTimer( &timerTotal );
+    sdkDeleteTimer( &timerSetup );
+}
+
+/**
+ * Runs and times the two CPU reference implementations, then prints their
+ * run times and results.
+ *
+ * Both runs use n*num_threads*num_blocks as the sample count, with the
+ * sigma arguments 1 and 0.16.
+ *
+ * @param argc unused
+ * @param argv unused
+ */
+void seqCompute(int argc, char** argv) {
+    StopWatchInterface *doubleTimer = 0;
+    StopWatchInterface *floatTimer = 0;
+
+    /* Double-precision reference run */
+    sdkCreateTimer( &doubleTimer );
+    sdkStartTimer( &doubleTimer );
+    const double fractionDouble = computeGoldDouble(n*num_threads*num_blocks,1,0.16);
+    sdkStopTimer( &doubleTimer );
+
+    /* Single-precision reference run */
+    sdkCreateTimer( &floatTimer );
+    sdkStartTimer( &floatTimer );
+    const double fractionFloat = computeGoldFloat(n*num_threads*num_blocks,1,0.16);
+    sdkStopTimer( &floatTimer );
+
+    /* Report both runs side by side */
+    const double msDouble = sdkGetTimerValue( &doubleTimer );
+    const double msFloat = sdkGetTimerValue( &floatTimer );
+    printf("----------------\n");
+    printf("CPU versions:\n");
+    printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", msDouble, msFloat );
+    printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",fractionDouble,fractionFloat);
+
+    sdkDeleteTimer( &doubleTimer );
+    sdkDeleteTimer( &floatTimer );
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+/* Prints the run configuration, then runs the CPU reference versions
+ * followed by the GPU float (0) and GPU double (1) versions. */
+int
+main( int argc, char** argv)
+{
+    /* n is unsigned int (%u) and the total is unsigned long long (%llu);
+     * a mismatched conversion specifier is undefined behaviour and prints a
+     * truncated value where unsigned long is 32 bits wide. */
+    printf("Config: Blocks:%i Threads/Block:%i, n:%u, Actual N:%llu\n",num_blocks,num_threads,n,(unsigned long long) num_blocks*num_threads*n*kernelIter);
+    seqCompute(argc, argv);
+    gpuCompute(0);
+    gpuCompute(1);
+
+    return 0;
+}
View
459 ber_gpu/template_time.cu
@@ -1,459 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/* Template project which demonstrates the basics on how to setup a project
-* example application.
-* Host code.
-*/
-
-// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <iostream>
-
-// includes CUDA
-#include <cuda.h>
-#include <curand_kernel.h>
-#include <cuda_runtime.h>
-
-// includes, project
-#include <sdkHelper.h> // helper for shared that are common to CUDA SDK samples
-#include <shrQATest.h> // This is for automated testing output (--qatest)
-#include <shrUtils.h>
-
-// includes, kernels
-#include <template_kernel.cu>
-
-////////////////////////////////////////////////////////////////////////////////
-// declaration, forward
-void runTest( int argc, char** argv);
-
-extern "C"
-double computeGoldDouble( const int n, const double awgn_sigma, const double phase_sigma);
-extern "C"
-double computeGoldFloat( const int n, const float awgn_sigma, const float phase_sigma);
-
-
-int num_blocks = 16;
-int num_threads = 384;
-unsigned int n = 53332*2;
-int kernelIter = 1;
-////////////////////////////////////////////////////////////////////////////////
-// These are CUDA Helper functions
-
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line )
-{
- if(cudaSuccess != err)
- {
- fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
- exit(-1);
- }
-}
-
-// This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
-
-inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )
-{
- cudaError_t err = cudaGetLastError();
- if (cudaSuccess != err)
- {
- fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
- file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
- exit(-1);
- }
-}
-
-// General GPU Device CUDA Initialization
-int gpuDeviceInit(int devID)
-{
- int deviceCount;
- checkCudaErrors(cudaGetDeviceCount(&deviceCount));
-
- if (deviceCount == 0)
- {
- fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
- exit(-1);
- }
-
- if (devID < 0)
- devID = 0;
-
- if (devID > deviceCount-1)
- {
- fprintf(stderr, "\n");
- fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
- fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
- fprintf(stderr, "\n");
- return -devID;
- }
-
- cudaDeviceProp deviceProp;
- checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
-
- if (deviceProp.major < 1)
- {
- fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
- exit(-1);
- }
-
- checkCudaErrors( cudaSetDevice(devID) );
- printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
-
- return devID;
-}
-
-// This function returns the best GPU (with maximum GFLOPS)
-int gpuGetMaxGflopsDeviceId()
-{
- int current_device = 0, sm_per_multiproc = 0;
- int max_compute_perf = 0, max_perf_device = 0;
- int device_count = 0, best_SM_arch = 0;
- cudaDeviceProp deviceProp;
- cudaGetDeviceCount( &device_count );
-
- // Find the best major SM Architecture GPU device
- while (current_device < device_count)
- {
- cudaGetDeviceProperties( &deviceProp, current_device );
- if (deviceProp.major > 0 && deviceProp.major < 9999)
- {
- best_SM_arch = MAX(best_SM_arch, deviceProp.major);
- }
- current_device++;
- }
-
- // Find the best CUDA capable GPU device
- current_device = 0;
- while( current_device < device_count )
- {
- cudaGetDeviceProperties( &deviceProp, current_device );
- if (deviceProp.major == 9999 && deviceProp.minor == 9999)
- {
- sm_per_multiproc = 1;
- }
- else
- {
- sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
- }
-
- int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
-
- if( compute_perf > max_compute_perf )
- {
- // If we find GPU with SM major > 2, search only these
- if ( best_SM_arch > 2 )
- {
- // If our device==dest_SM_arch, choose this, or else pass
- if (deviceProp.major == best_SM_arch)
- {
- max_compute_perf = compute_perf;
- max_perf_device = current_device;
- }
- }
- else
- {
- max_compute_perf = compute_perf;
- max_perf_device = current_device;
- }
- }
- ++current_device;
- }
- return max_perf_device;
-}
-
-
-// Initialization code to find the best CUDA Device
-int findCudaDevice(int argc, const char **argv)
-{
- cudaDeviceProp deviceProp;
- int devID = 0;
- // If the command-line has a device number specified, use it
- if (checkCmdLineFlag(argc, argv, "device"))
- {
- devID = getCmdLineArgumentInt(argc, argv, "device=");
- if (devID < 0)
- {
- printf("Invalid command line parameter\n ");
- exit(-1);
- }
- else
- {
- devID = gpuDeviceInit(devID);
- if (devID < 0)
- {
- printf("exiting...\n");
- shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
- exit(-1);
- }
- }
- }
- else
- {
- // Otherwise pick the device with highest Gflops/s
- devID = gpuGetMaxGflopsDeviceId();
- checkCudaErrors( cudaSetDevice( devID ) );
- checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
- printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
- }
- return devID;
-}
-// end of CUDA Helper Functions
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-//! Run a simple test for CUDA
-////////////////////////////////////////////////////////////////////////////////
-void
-runTest( int argc, char** argv)
-{
- bool bTestResult = true;
-
- shrQAStart(argc, argv);
-
- // use command-line specified CUDA device, otherwise use device with highest Gflops/s
- int devID = findCudaDevice(argc, (const char**)argv);
-
- StopWatchInterface *timer = 0;
- sdkCreateTimer( &timer );
- sdkStartTimer( &timer );
-
- unsigned int num_threads = 32;
- unsigned int mem_size = sizeof( float) * num_threads;
-
- // allocate host memory
- float* h_idata = (float*) malloc( mem_size);
- // initalize the memory
- for( unsigned int i = 0; i < num_threads; ++i)
- {
- h_idata[i] = (float) i;
- }
-
- // allocate device memory
- float* d_idata;
- checkCudaErrors( cudaMalloc( (void**) &d_idata, mem_size) );
- // copy host memory to device
- checkCudaErrors( cudaMemcpy( d_idata, h_idata, mem_size,
- cudaMemcpyHostToDevice) );
-
- // allocate device memory for result
- float* d_odata;
- checkCudaErrors( cudaMalloc( (void**) &d_odata, mem_size));
-
- // setup execution parameters
- dim3 grid( 1, 1, 1);
- dim3 threads( num_threads, 1, 1);
-
- // execute the kernel
-// testKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);
-
- // check if kernel execution generated and error
- getLastCudaError("Kernel execution failed");
-
- // allocate mem for the result on host side
- float* h_odata = (float*) malloc( mem_size);
- // copy result from device to host
- checkCudaErrors( cudaMemcpy( h_odata, d_odata, sizeof( float) * num_threads,
- cudaMemcpyDeviceToHost) );
-
- sdkStopTimer( &timer );
- printf( "Processing time: %f (ms)\n", sdkGetTimerValue( &timer ) );
- sdkDeleteTimer( &timer );
-
- // compute reference solution
- float* reference = (float*) malloc( mem_size);
- //computeGold( reference, h_idata, num_threads);
-
- // check result
- if( checkCmdLineFlag( argc, (const char**) argv, "regression") )
- {
- // write file for regression test
- sdkWriteFile( "./data/regression.dat", h_odata, num_threads, 0.0f, false );
- }
- else
- {
- // custom output handling when no regression test running
- // in this case check if the result is equivalent to the expected soluion
- bTestResult = compareData( reference, h_odata, num_threads, 0.0f, 0.0f );
- }
- // cleanup memory
- free( h_idata );
- free( h_odata );
- free( reference );
- checkCudaErrors(cudaFree(d_idata));
- checkCudaErrors(cudaFree(d_odata));
-
- cudaDeviceReset();
- shrQAFinishExit(argc, (const char **)argv, (bTestResult ? QA_PASSED : QA_FAILED) );
-}
-
-void gpuCompute(int chooseKernel)
-{
-
-
-
- unsigned int i;
- unsigned long long total = 0;
- curandState *devStates;
- unsigned int * devResults , *hostResults, *devBitTable;
- double awgn_sigma = 1;
- //double awgn_sigma = pow(2.0,-7);
- double phase_sigma = 0.16;
- //double phase_sigma = 0.04;
-
- StopWatchInterface *timerKernel = 0;
- StopWatchInterface *timerMemory = 0;
- StopWatchInterface *timerAdd = 0;
- StopWatchInterface *timerTotal = 0;
- StopWatchInterface *timerSetup = 0;
- sdkCreateTimer( &timerKernel );
- sdkCreateTimer( &timerMemory);
- sdkCreateTimer( &timerAdd );
- sdkCreateTimer( &timerTotal );
- sdkCreateTimer( &timerSetup );
-
- cudaDeviceReset(); //need to get constant times between runs
- sdkStartTimer( &timerTotal );
-
- sdkStartTimer( &timerSetup );
- /* Allocate space for results on device */
- checkCudaErrors( cudaMalloc (( void **)&devResults , num_blocks * num_threads * sizeof (unsigned int)));
-
- /* Allocate space for prng states on device */
- checkCudaErrors ( cudaMalloc (( void **)&devStates , num_blocks * num_threads * sizeof ( curandState ))); //48 bytes
- /* Allocate space for lookup table on device */
- checkCudaErrors ( cudaMalloc (( void **)&devBitTable , 16*sizeof(unsigned int)));
-
- /* Setup prng states */
- setup_kernel<<<num_blocks, num_threads>>>( devStates, devBitTable );
- /* Allocate space for results on host */
- hostResults = (unsigned int *) calloc (num_blocks * num_threads, sizeof (unsigned int));
-
-
- //initialise device result memory to 0
- checkCudaErrors( cudaMemset ( devResults , 0, num_blocks * num_threads * sizeof (unsigned int)));
- //sync for correct timing
- checkCudaErrors( cudaDeviceSynchronize() );
-
- sdkStopTimer( &timerSetup );
-
- sdkStartTimer( &timerKernel );
- for (int j=0; j<kernelIter; j++) {
- /* Copy device memory to host */
- if(chooseKernel) {
- kernelDouble<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
- }
- else {
- kernelFloat<<<num_blocks, num_threads>>>(devStates, n/4,awgn_sigma,phase_sigma, devResults, devBitTable );
- }
- checkCudaErrors( cudaDeviceSynchronize() );
-
- }
- sdkStopTimer( &timerKernel);
-
- sdkStartTimer( &timerMemory );
- checkCudaErrors ( cudaMemcpy ( hostResults , devResults , num_blocks*num_threads*sizeof(unsigned int), cudaMemcpyDeviceToHost ));
- checkCudaErrors( cudaDeviceSynchronize() );
- /* Show result */
- sdkStopTimer( &timerMemory);
- sdkStartTimer( &timerAdd );
- for(i = 0; i < num_blocks * num_threads; i++) {
- total += hostResults [i];
- }
- double result = ( double ) total / (num_blocks * num_threads * n * kernelIter );
-
- sdkStopTimer( &timerAdd);
- sdkStopTimer( &timerTotal);
-
- printf("----------------\n");
- if (chooseKernel) printf("GPU Double version:\n"); else printf("GPU Float version:\n");
- printf("----------------\n");
- printf( "Total time: %f (ms)\n", sdkGetTimerValue( &timerTotal ) );
- printf( "Setup time: %f (ms)\n", sdkGetTimerValue( &timerSetup ) );
- printf( "Kernel time: %f (ms)\n", sdkGetTimerValue( &timerKernel ) );
- printf( "Result copy to host time: %f (ms)\n", sdkGetTimerValue( &timerMemory ) );
- printf( "Result add time: %f (ms)\n", sdkGetTimerValue( &timerAdd ) );
- printf( "The error bits fraction: %10.13lg (%lg)\n", result,result);
-
-
- /* Cleanup */
- checkCudaErrors ( cudaFree ( devStates ));
- checkCudaErrors ( cudaFree ( devResults ));
- checkCudaErrors ( cudaFree ( devBitTable ));
- free ( hostResults );
-
- sdkDeleteTimer( &timerKernel );
- sdkDeleteTimer( &timerMemory);
- sdkDeleteTimer( &timerAdd );
- sdkDeleteTimer( &timerTotal );
- sdkDeleteTimer( &timerSetup );
-}
-
-void seqCompute(int argc, char** argv) {
- StopWatchInterface *timer = 0;
- StopWatchInterface *timer2 = 0;
- /*
- sdkCreateTimer( &timer );
- sdkStartTimer( &timer );
-
- double[81] phase_sigma;
- for (int k=0; k<81; k++) {
- phase_sigma[k] =
- }
- for j=1:length(phase_sigma)
- for i=1:length(awgn_sigma)
- error_rate(i,j)=ber_test(n,awgn_sigma(i),phase_sigma(j));
- end
- */
-
- //awgn_sigma=2^(-8:0.1:0);
- //phase_sigma=[0.01,0.04,0.08,0.16];
- sdkCreateTimer( &timer );
- sdkStartTimer( &timer );
- double resultDouble = computeGoldDouble(n*num_threads*num_blocks,1,0.16);
- sdkStopTimer( &timer );
-
- sdkCreateTimer( &timer2 );
- sdkStartTimer( &timer2 );
- double resultFloat = computeGoldFloat(n*num_threads*num_blocks,1,0.16);
- sdkStopTimer( &timer2 );
-
-
- printf("----------------\n");
- printf("CPU versions:\n");
- printf( "Processing total time (Double,Float): %.2lf, %.2lf (ms)\n", sdkGetTimerValue( &timer ),sdkGetTimerValue( &timer2 ) );
- printf("The error bits fraction (Double,Float): %10.13lg, %10.13lg\n",resultDouble,resultFloat);
-
- sdkDeleteTimer( &timer );
- sdkDeleteTimer( &timer2 );
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int
-main( int argc, char** argv)
-{
- shrQAStart(argc, argv);
- printf("GPU configuration: Blocks:%i Threads/Block:%i, n:%i, kernelIter:%i \n",num_blocks,num_threads,n,kernelIter);
- //runTest( argc, argv);
- //seqCompute(argc, argv);
- gpuCompute(0);
- gpuCompute(1);
- //std::cin(0);
-}
View
14 ber_gpu/template_vs2010.vcxproj
@@ -197,22 +197,16 @@
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
- <CudaCompile Include="template.cu" />
- <CudaCompile Include="template_time.cu">
- <Include Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">./;../../common/inc;../../../shared/inc</Include>
- <Include Condition="'$(Configuration)|$(Platform)'=='Release|x64'">./;../../common/inc;../../../shared/inc</Include>
- <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10;compute_20,sm_20</CodeGeneration>
- <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10;compute_20,sm_20</CodeGeneration>
- <TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">64</TargetMachinePlatform>
- <TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Release|x64'">64</TargetMachinePlatform>
+ <CudaCompile Include="main_results.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</CudaCompile>
- <CudaCompile Include="template_kernel.cu">
+ <CudaCompile Include="gpu_kernel.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</CudaCompile>
+ <CudaCompile Include="main_timing.cu" />
</ItemGroup>
<ItemGroup>
- <ClCompile Include="template_gold.cpp" />
+ <ClCompile Include="C_gold.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
View
BIN ber_gpu/vc100.pdb
Binary file not shown.
View
9 ber_matlab/script.m
@@ -1,11 +1,14 @@
+tic;
+ber_test(256*32*8000,1,0.16);
+toc
+
%compare_plots(c100s,c100d)
close all
%g100000d = importfile('g100000d.csv');
%g10000000s = importfile('g10000000s.csv');
%g1000000s = importfile('g10000000s.csv');
-[output,same,absError,relError,maxAbsError,maxRelError] = compareMatrices(g1000d,m10000db,3e-4);
+%[output,same,absError,relError,maxAbsError,maxRelError] = compareMatrices(g1000d,m10000db,3e-4);
format compact
format short g
-maxRelError
-maxAbsError
+
View
BIN version1.6.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit e718137

Please sign in to comment.
Something went wrong with that request. Please try again.