## CEPARCO CUDA Project Group 1

In [136]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin:/usr/local/cuda/bin:/usr/local/cuda/bin


# C Program

In [137]:
%%writefile C_asum.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h> //fabsf and cos/sin functions

void asum(size_t n, double* a, double* res) {
  *res = 0.0;
  for (int i=0; i<n;i++)
     *res += fabs(a[i]);
}

int main(int argc, char** argv){
   const size_t N = 28;
   const size_t ARRAY_SIZE = 1<<N;
   const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
   const size_t loope = 10;

   double *a, *res;
   a = (double*)malloc(ARRAY_BYTES);
   res = (double*)malloc(sizeof(double));

   clock_t start, end;

   for (int i = 0; i < ARRAY_SIZE; i++) {
    a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
   }

   *res = 0.0;

   asum(ARRAY_SIZE,a,res);

   double elapse, time_taken;
   elapse = 0.0f;
   for (int i=0; i<loope; i++){
    start = clock();
     asum(ARRAY_SIZE,a,res);
    end = clock();
    time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
    elapse = elapse + time_taken;
   }

    printf("Function (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", loope, elapse/loope, ARRAY_SIZE);
    printf("Absolute sum of vector size 2^%lu: %lf \n",N,*res);


  double err_res = 0.0;
   for (int i=0; i<ARRAY_SIZE; i++)
        err_res += fabs(a[i]);
   if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
   else
        printf("No errors encountered");

    free(a);
    return 0;
}

Overwriting C_asum.c


In [138]:
%%bash
gcc C_asum.c -lm -o C_asum

In [139]:
%%bash
./C_asum

Function (in C) average time for 10 loops is 1075.339200 milliseconds to execute an array size 268435456 
Absolute sum of vector size 2^28: 108762865473.985641 
No errors encountered

# Grid Stride; no prefetch, no page creation, no mem advise

In [140]:
%%writefile CUDA_asum1.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h> //fabsf and cos/sin functions

__global__
void asum(size_t n, double* a, double *res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        atomicAdd(res,fabs(a[i])); //proper summation function for CUDA
}

int main(){
   const size_t N = 28;
   const size_t ARRAY_SIZE = 1<<N;
   const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
   const size_t loope = 10;

   double *a, *res;
   cudaMallocManaged(&a, ARRAY_BYTES);
   cudaMallocManaged(&res, sizeof(double));

   for (int i = 0; i < ARRAY_SIZE; i++) {
    a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
   }

   *res = 0.0;

  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = Double asum\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks,numThreads);
  for (size_t i=0; i<loope;i++){
    *res = 0.0;
    asum <<<numBlocks, numThreads>>> (ARRAY_SIZE,a,res);
    cudaDeviceSynchronize();
  }

  printf("Absolute sum of vector size 2^%lu: %lf \n",N,*res);

  //Since summation is performed by each kernel, there is no "element by element" error checking
  double err_res = 0.0;
   for (int i=0; i<ARRAY_SIZE; i++)
        err_res += fabs(a[i]);
   if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
   else
        printf("No errors encountered");

  cudaFree(a);
  cudaFree(res);
  return 0;
}
//In compiling, -arch=sm_60 allows CUDA 6.0 for use of atomicAdd with double* and double parameters

Overwriting CUDA_asum1.cu


In [141]:
%%bash
nvcc CUDA_asum1.cu -lm -o CUDA_asum1 -Wno-deprecated-gpu-targets -arch=sm_60

In [142]:
%%bash
nvprof ./CUDA_asum1

==994913== NVPROF is profiling process 994913, command: ./CUDA_asum1


*** function = Double asum
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Absolute sum of vector size 2^28: 108762865473.984207 
No errors encountered

==994913== Profiling application: ./CUDA_asum1
==994913== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.77430s        10  677.43ms  614.18ms  1.24349s  asum(unsigned long, double*, double*)
      API calls:   91.86%  6.77468s        10  677.47ms  614.21ms  1.24352s  cudaDeviceSynchronize
                    7.02%  518.04ms         2  259.02ms  215.37us  517.82ms  cudaMallocManaged
                    1.10%  80.889ms         2  40.444ms  364.88us  80.524ms  cudaFree
                    0.02%  1.4070ms        10  140.70us  87.799us  498.50us  cudaLaunchKernel
                    0.00%  237.07us       114  2.0790us     115ns  98.651us  cuDeviceGetAttribute
                    0.00%  46.300us         1  46.300us  46.300us  46.300us  cuDeviceGetName
                    0.00%  18.234us         1  18.234us  18.234us  18.234us  cuDeviceTotalMem
                    0.00%  3.4690us         1  3.4690us  3.4690us  3

# Grid Stride; with prefetch, no page creation, no mem advise

In [143]:
%%writefile CUDA_asum2.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

__global__
void asum(size_t n, double* a, double *res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        atomicAdd(res,fabs(a[i]));
}


int main(){
   const size_t N = 28;
   const size_t ARRAY_SIZE = 1<<N;
   const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
   const size_t loope = 10;

   double *a, *res;
   cudaMallocManaged(&a, ARRAY_BYTES);
   cudaMallocManaged(&res, sizeof(double));

  int device = -1;
  cudaGetDevice(&device);

   for (int i = 0; i < ARRAY_SIZE; i++) {
    a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
   }

   *res = 0.0;

  cudaMemPrefetchAsync(a,ARRAY_BYTES,device,NULL);

  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = Double asum\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks,numThreads);
  for (size_t i=0; i<loope;i++){
    *res = 0.0;
    asum <<<numBlocks, numThreads>>> (ARRAY_SIZE,a,res);
    cudaDeviceSynchronize();
  }

//"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(a,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(res,sizeof(double),cudaCpuDeviceId,NULL);

  printf("Absolute sum of vector size 2^%lu: %lf \n",N,*res);

  double err_res = 0.0;
   for (int i=0; i<ARRAY_SIZE; i++)
        err_res += fabs(a[i]);
   if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
   else
        printf("No errors encountered");

  cudaFree(a);
  cudaFree(res);
  return 0;
}

Overwriting CUDA_asum2.cu


In [144]:
%%bash
nvcc CUDA_asum2.cu -lm -o CUDA_asum2 -Wno-deprecated-gpu-targets -arch=sm_60

In [145]:
%%bash
nvprof ./CUDA_asum2

==994972== NVPROF is profiling process 994972, command: ./CUDA_asum2


*** function = Double asum
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Absolute sum of vector size 2^28: 108762865473.984070 
No errors encountered

==994972== Profiling application: ./CUDA_asum2
==994972== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.14741s        10  614.74ms  614.28ms  617.26ms  asum(unsigned long, double*, double*)
      API calls:   83.53%  6.14767s        10  614.77ms  614.30ms  617.28ms  cudaDeviceSynchronize
                    7.22%  531.54ms         2  265.77ms  432.24us  531.11ms  cudaMallocManaged
                    5.93%  436.62ms         3  145.54ms  143.09us  361.12ms  cudaMemPrefetchAsync
                    1.68%  123.57ms         2  61.784ms  2.6801ms  120.89ms  cudaFree
                    1.63%  119.92ms        10  11.992ms  71.172us  118.89ms  cudaLaunchKernel
                    0.00%  233.15us       114  2.0450us     112ns  98.084us  cuDeviceGetAttribute
                    0.00%  51.044us         1  51.044us  51.044us  51.044us  cudaGetDevice
                    0.00%  35.760us         1  35.760us  35.760us 

# Grid Stride; with prefetch; with page creation; no mem advise

In [146]:
%%writefile CUDA_asum3.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

__global__
void asum(size_t n, double* a, double *res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        atomicAdd(res,fabs(a[i]));
}


int main(){
   const size_t N = 28;
   const size_t ARRAY_SIZE = 1<<N;
   const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
   const size_t loope = 10;

   double *a, *res;
   cudaMallocManaged(&a, ARRAY_BYTES);
   cudaMallocManaged(&res, sizeof(double));

  int device = -1;
  cudaGetDevice(&device);

  cudaMemPrefetchAsync(a,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(res,sizeof(double),device,NULL);

   for (int i = 0; i < ARRAY_SIZE; i++) {
    a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
   }

   *res = 0.0;

  cudaMemPrefetchAsync(a,ARRAY_BYTES,device,NULL);

  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = Double asum\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks,numThreads);
  for (size_t i=0; i<loope;i++){
    *res = 0.0;
    asum <<<numBlocks, numThreads>>> (ARRAY_SIZE,a,res);
    cudaDeviceSynchronize();
  }

//"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(a,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(res,sizeof(double),cudaCpuDeviceId,NULL);

  printf("Absolute sum of vector size 2^%lu: %lf \n",N,*res);

  double err_res = 0.0;
   for (int i=0; i<ARRAY_SIZE; i++)
        err_res += fabs(a[i]);
   if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
   else
        printf("No errors encountered");

  cudaFree(a);
  cudaFree(res);
  return 0;
}

Overwriting CUDA_asum3.cu


In [147]:
%%bash
nvcc CUDA_asum3.cu -lm -o CUDA_asum3 -Wno-deprecated-gpu-targets -arch=sm_60

In [148]:
%%bash
nvprof ./CUDA_asum3

==995030== NVPROF is profiling process 995030, command: ./CUDA_asum3


*** function = Double asum
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Absolute sum of vector size 2^28: 108762865473.983566 
No errors encountered

==995030== Profiling application: ./CUDA_asum3
==995030== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.14550s        10  614.55ms  614.42ms  614.73ms  asum(unsigned long, double*, double*)
      API calls:   77.62%  6.14508s        10  614.51ms  614.38ms  614.67ms  cudaDeviceSynchronize
                   14.50%  1.14826s         5  229.65ms  142.99us  761.50ms  cudaMemPrefetchAsync
                    5.46%  432.61ms         2  216.30ms  362.51us  432.24ms  cudaMallocManaged
                    1.64%  129.92ms        10  12.992ms  128.76us  128.49ms  cudaLaunchKernel
                    0.77%  60.790ms         2  30.395ms  317.65us  60.472ms  cudaFree
                    0.00%  361.51us       114  3.1710us     205ns  142.24us  cuDeviceGetAttribute
                    0.00%  52.134us         1  52.134us  52.134us  52.134us  cuDeviceGetName
                    0.00%  14.554us         1  14.554us  14.554u

# Grid Stride; with prefetch; with page creation; with mem advise

In [164]:
%%writefile CUDA_asum4.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

__global__
void asum(size_t n, double* a, double *res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        atomicAdd(res,fabs(a[i]));
}


int main(){
   const size_t N = 28;
   const size_t ARRAY_SIZE = 1<<N;
   const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
   const size_t loope = 10;

   double *a, *res;
   cudaMallocManaged(&a, ARRAY_BYTES);
   cudaMallocManaged(&res, sizeof(double));

  int device = -1;
  cudaGetDevice(&device);


// memory advise
   cudaMemAdvise(a, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
   cudaMemAdvise(a, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

//"prefetch data" to create CPU page memory
  cudaMemPrefetchAsync(a,ARRAY_BYTES,cudaCpuDeviceId,NULL);
//"prefetch data" to create GPU page memory
  cudaMemPrefetchAsync(res,sizeof(double),device,NULL);

// ****init array
   for (int i = 0; i < ARRAY_SIZE; i++) {
    a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
   }

   *res = 0.0;

 //"Prefetch data" from CPU-GPU
  cudaMemPrefetchAsync(a,ARRAY_BYTES,device,NULL);

// setup CUDA kernel
  size_t numThreads = 1024;
  size_t numBlocks = (ARRAY_SIZE + numThreads-1) / numThreads;

  printf("*** function = Double asum\n");
  printf("numElements = %lu\n", ARRAY_SIZE);
  printf("numBlocks = %lu, numThreads = %lu \n",numBlocks,numThreads);
  for (size_t i=0; i<loope;i++){
    *res = 0.0;
    asum <<<numBlocks, numThreads>>> (ARRAY_SIZE,a,res);
    cudaDeviceSynchronize();
  }

//"Prefetch data" from GPU-CPU
  cudaMemPrefetchAsync(a,ARRAY_BYTES,cudaCpuDeviceId,NULL);
  cudaMemPrefetchAsync(res,sizeof(double),cudaCpuDeviceId,NULL);

  printf("Absolute sum of vector size 2^%lu: %lf \n",N,*res);

  double err_res = 0.0;
   for (int i=0; i<ARRAY_SIZE; i++)
        err_res += fabs(a[i]);
   if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
   else
        printf("No errors encountered");

  cudaFree(a);
  cudaFree(res);
  return 0;
}

Overwriting CUDA_asum4.cu


In [165]:
%%bash
nvcc CUDA_asum4.cu -lm -o CUDA_asum4 -Wno-deprecated-gpu-targets -arch=sm_60

In [166]:
%%bash
nvprof ./CUDA_asum4

==995509== NVPROF is profiling process 995509, command: ./CUDA_asum4


*** function = Double asum
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Absolute sum of vector size 2^28: 108762865473.985260 
No errors encountered

==995509== Profiling application: ./CUDA_asum4
==995509== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.14846s        10  614.85ms  614.54ms  616.32ms  asum(unsigned long, double*, double*)
      API calls:   78.99%  6.14887s        10  614.89ms  614.56ms  616.35ms  cudaDeviceSynchronize
                   14.75%  1.14843s         5  229.69ms  79.875us  910.78ms  cudaMemPrefetchAsync
                    5.09%  396.24ms         2  198.12ms  160.53us  396.08ms  cudaMallocManaged
                    1.14%  88.835ms         2  44.417ms  217.70us  88.617ms  cudaFree
                    0.02%  1.3105ms        10  131.05us  73.616us  486.54us  cudaLaunchKernel
                    0.00%  182.42us       114  1.6000us     109ns  77.594us  cuDeviceGetAttribute
                    0.00%  82.160us         2  41.080us  10.896us  71.264us  cudaMemAdvise
                    0.00%  20.925us         1  20.925us  20.925us 

# Classic MemCopy (no Unified Memory)

In [152]:
%%writefile CUDA_asum5.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

__global__
void asum(size_t n, double* a, double *res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    double localSum = 0.0;

    // Each thread sums part of the array
    for (int i = index; i < n; i += stride)
        localSum += fabs(a[i]);

    // Atomic add partial results to global sum
    atomicAdd(res, localSum);
}

int main() {
    const size_t N = 28;
    const size_t ARRAY_SIZE = 1 << N;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
    const size_t loope = 10;

    // Allocate host memory (regular malloc)
    double *h_a = (double*)malloc(ARRAY_BYTES);
    double h_res = 0.0;

    // Initialize host data
    for (size_t i = 0; i < ARRAY_SIZE; i++)
        h_a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;

    // Allocate device memory (no Unified Memory)
    double *d_a, *d_res;
    cudaMalloc(&d_a, ARRAY_BYTES);
    cudaMalloc(&d_res, sizeof(double));

    // Copy data to device
    cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_res, &h_res, sizeof(double), cudaMemcpyHostToDevice);

    // Kernel setup
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;

    printf("*** function = Double asum (Classic MemCopy)\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu\n", numBlocks, numThreads);

    // Run kernel multiple times
    double zero = 0.0;
    for (size_t i = 0; i < loope; i++) {
        cudaMemcpy(d_res, &zero, sizeof(double), cudaMemcpyHostToDevice);
        asum<<<numBlocks, numThreads>>>(ARRAY_SIZE, d_a, d_res);
        cudaDeviceSynchronize();
    }

    // Copy result back to host
    cudaMemcpy(&h_res, d_res, sizeof(double), cudaMemcpyDeviceToHost);

    printf("Absolute sum of vector size 2^%lu: %lf\n", N, h_res);

    // Validate result
    double err_res = 0.0;
    for (size_t i = 0; i < ARRAY_SIZE; i++)
        err_res += fabs(h_a[i]);

    if (fabs(err_res - h_res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", h_res, err_res, (err_res - h_res));
    else
        printf("No errors encountered\n");

    // Free memory
    cudaFree(d_a);
    cudaFree(d_res);
    free(h_a);

    return 0;
}

Overwriting CUDA_asum5.cu


In [153]:
%%bash
nvcc CUDA_asum5.cu -lm -o CUDA_asum5 -Wno-deprecated-gpu-targets -arch=sm_60

In [154]:
%%bash
nvprof ./CUDA_asum5

==995143== NVPROF is profiling process 995143, command: ./CUDA_asum5


*** function = Double asum (Classic MemCopy)
numElements = 268435456
numBlocks = 262144, numThreads = 1024
Absolute sum of vector size 2^28: 108762865473.983994
No errors encountered


==995143== Profiling application: ./CUDA_asum5
==995143== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   87.29%  6.13784s        10  613.78ms  613.78ms  613.79ms  asum(unsigned long, double*, double*)
                   12.71%  893.55ms        12  74.463ms     640ns  893.54ms  [CUDA memcpy HtoD]
                    0.00%  3.0720us         1  3.0720us  3.0720us  3.0720us  [CUDA memcpy DtoH]
      API calls:   82.86%  6.13814s        10  613.81ms  613.80ms  613.84ms  cudaDeviceSynchronize
                   12.08%  894.85ms        13  68.835ms  10.763us  893.93ms  cudaMemcpy
                    5.01%  371.38ms         2  185.69ms  125.90us  371.25ms  cudaMalloc
                    0.03%  2.0462ms         2  1.0231ms  112.14us  1.9341ms  cudaFree
                    0.02%  1.2104ms        10  121.04us  32.102us  542.35us  cudaLaunchKernel
                    0.00%  205.33us       114  1.8010us     117ns  95.307us  cuD

# Grid-Stride Loop with Prefetch and GPU Data Initialization

In [155]:
%%writefile CUDA_asum6.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

__global__
void init_array(size_t n, double* a) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        a[i] = sin((double)i * 0.0003) * cos((double)i * 0.0007) * 1000.0;
    }
}

__global__
void asum(size_t n, double* a, double *res) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride)
        atomicAdd(res, fabs(a[i]));
}

int main() {
    const size_t N = 28;
    const size_t ARRAY_SIZE = 1 << N;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(double);
    const size_t loope = 10;

    double *a, *res;
    cudaMallocManaged(&a, ARRAY_BYTES);
    cudaMallocManaged(&res, sizeof(double));

    int device = -1;
    cudaGetDevice(&device);

    // Prefetch to GPU before initializing
    cudaMemPrefetchAsync(a, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(res, sizeof(double), device, NULL);

    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;

    // GPU kernel to initialize array
    init_array<<<numBlocks, numThreads>>>(ARRAY_SIZE, a);
    cudaDeviceSynchronize();

    *res = 0.0;

    printf("*** function = Double asum (GPU init data)\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i=0; i<loope;i++){
      *res = 0.0;
      asum <<<numBlocks, numThreads>>> (ARRAY_SIZE,a,res);
      cudaDeviceSynchronize();
    }

    // Prefetch back to CPU for validation
    cudaMemPrefetchAsync(a, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(res, sizeof(double), cudaCpuDeviceId, NULL);

    printf("Absolute sum of vector size 2^%lu: %lf \n", N, *res);

    // Verify result on CPU
    double err_res = 0.0;
    for (size_t i = 0; i < ARRAY_SIZE; i++)
        err_res += fabs(a[i]);

    if (fabs(err_res - *res) > 1e-2)
        printf("Error encountered: \n Function result: %lf \n Error checking result: %lf \n Error difference: %lf \n", *res, err_res, (err_res - *res));
    else
        printf("No errors encountered\n");

    cudaFree(a);
    cudaFree(res);
    return 0;
}

Overwriting CUDA_asum6.cu


In [156]:
%%bash
nvcc CUDA_asum6.cu -lm -o CUDA_asum6 -Wno-deprecated-gpu-targets -arch=sm_60

In [157]:
%%bash
nvprof ./CUDA_asum6

==995205== NVPROF is profiling process 995205, command: ./CUDA_asum6


*** function = Double asum (GPU init data)
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Absolute sum of vector size 2^28: 108762865473.984299 
No errors encountered


==995205== Profiling application: ./CUDA_asum6
==995205== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.95%  6.14571s        10  614.57ms  614.30ms  614.71ms  asum(unsigned long, double*, double*)
                    0.05%  3.2484ms         1  3.2484ms  3.2484ms  3.2484ms  init_array(unsigned long, double*)
      API calls:   80.74%  6.14928s        11  559.03ms  3.2807ms  614.83ms  cudaDeviceSynchronize
                   13.41%  1.02105s         4  255.26ms  148.00us  1.00353s  cudaMemPrefetchAsync
                    5.00%  380.48ms         2  190.24ms  264.03us  380.22ms  cudaMallocManaged
                    0.82%  62.749ms         2  31.374ms  292.64us  62.456ms  cudaFree
                    0.03%  2.0772ms        11  188.83us  73.388us  922.05us  cudaLaunchKernel
                    0.00%  294.08us       114  2.5790us     119ns  138.29us  cuDeviceGetAttribute
                    0.00%  54.384us         1