<a href="https://colab.research.google.com/github/czarinadamienne/GRP6_CUDAPROJECT/blob/main/GRP6_CUDAPROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing **Histogram Counting** with the **shared-memory concept** in CUDA

### Check CUDA

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## (1) C Implementation

In [None]:
%%writefile C_histcount.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Histogram counting: tallies every element of vec into one of ten bins.
 *
 * n        - number of elements in vec
 * histbins - array of at least 10 counters; incremented in place, so the
 *            caller is responsible for zeroing it beforehand
 * vec      - input values; each value is binned by its remainder modulo 10
 *
 * Negative values are mapped to their non-negative remainder (-1 -> bin 9).
 * C's % keeps the sign of the dividend, so without this adjustment a negative
 * element would index before the start of histbins.
 */
void histcount(size_t n, int32_t* histbins, int32_t *vec){
  for (size_t i = 0; i < n; i++){ //size_t index matches the type of n
    int32_t bin = vec[i] % 10;
    if (bin < 0){
      bin += 10;
    }
    histbins[bin]++;
  }
}

/*
 * Benchmark driver for the C histogram.
 *
 * Fills a vector with its own index values, runs histcount once as a warm-up,
 * then times `repeat` runs with clock() and prints the bins of the final run,
 * the average time per run, and the number of bins that disagree with an
 * independently recomputed histogram.
 *
 * Returns 0 on success, EXIT_FAILURE when the input vector cannot be allocated.
 */
int main(int argc, char** argv){
  (void)argc; //command-line arguments are not used
  (void)argv;

  enum { NUM_BINS = 10 }; //must match the modulo used by histcount

  const size_t size = 10; //change to 1 << 28
  const size_t arr_bytes = size * sizeof(int32_t);

  const size_t repeat = 30;

  //dynamically allocate
  int32_t *vec = (int32_t*)malloc(arr_bytes);
  if (vec == NULL){
    fprintf(stderr, "Failed to allocate %zu bytes for the input vector\n", arr_bytes);
    return EXIT_FAILURE;
  }
  int32_t histbins[NUM_BINS] = {0};

  //time test
  clock_t start, end;

  //initialize arrays with index value
  for (size_t i = 0; i < size; i++){
    vec[i] = (int32_t) i;
  }

  //warm-up run so the timed runs do not pay for cold caches
  histcount(size, histbins, vec);
  //clear the histogram; bounded by the bin count, not the vector length,
  //so a large `size` cannot write past the end of histbins
  for (int b = 0; b < NUM_BINS; b++){
    histbins[b] = 0;
  }

  //timer
  double elapse = 0.0;
  double time_taken;

  for (size_t run = 0; run < repeat; run++){
    start = clock();
    histcount(size, histbins, vec);
    end = clock();
    time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
    elapse += time_taken;

    //clear the histogram between runs, keeping the result of the last run
    if (run + 1 < repeat){
      for (int b = 0; b < NUM_BINS; b++){
        histbins[b] = 0;
      }
    }
  }

  printf("Historgram Bins: \n");
  for (int i = 0; i < NUM_BINS; i++){
    printf("Historgram Bin #%d: %d\n", i + 1, histbins[i]);
  }

  printf("\n\nC function:\n");
  printf("Average execution time: %f milliseconds\n", elapse/repeat);
  printf("Number of runs: %zu\n", repeat); //%zu is the size_t conversion
  printf("Array size: %zu", size);

  //recompute the histogram inline as the reference for the error check
  int ind = 0;
  int32_t histcheck[NUM_BINS] = {0};
  for (size_t i = 0; i < size; i++){
    ind = vec[i] % 10;
    histcheck[ind]++;
  }

  /*printf("\n\nVector: ");
  for (int i = 0; i < size; i++){
    printf("%d", vec[i]);
  }*/

  //error checker
  size_t error = 0;
  for (int i = 0; i < NUM_BINS; i++){
    if (histbins[i] != histcheck[i]){
      error++;
    }
  }

  printf("\n\nNumber of errors in C program: %zu\n\n", error);

  //free memory
  free(vec);

  return 0;
}

In [None]:
%%shell
# Build the C benchmark and run it only if compilation succeeded, so a stale
# binary from an earlier build is never executed after a failed compile.
gcc C_histcount.c -o C_histcount && ./C_histcount

## (2) CUDA Implementation

In [None]:
%%writefile CUDA_histcount.cu

#include <stdio.h>
#include <stdlib.h>

// CUDA histcount kernel
//
// Tallies the n elements of vec into the ten counters of histbins, binning
// each value by its remainder modulo 10.
//
// Launch layout: 1D grid of 1D blocks; any grid and block size is valid
//                because elements are visited in a grid-stride loop.
// Shared memory: 10 * sizeof(int32_t) bytes, statically allocated.
// Preconditions: histbins has at least 10 elements and is zeroed by the
//                caller; the kernel only adds to it.
//
// Each block accumulates into its own shared-memory histogram with atomicAdd
// and then merges it into histbins with one atomicAdd per non-empty bin. A
// plain `histbins[bin]++` is a racy read-modify-write that loses counts when
// several threads hit the same bin.
__global__ void histcount(size_t n, int32_t* histbins, int32_t* vec) {
    const int kBins = 10;
    __shared__ int32_t block_bins[kBins];

    // Shared memory starts uninitialized: zero it cooperatively.
    for (int b = threadIdx.x; b < kBins; b += blockDim.x) {
        block_bins[b] = 0;
    }
    __syncthreads();

    // size_t arithmetic keeps the flat index from overflowing for large n.
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        int32_t bin = vec[i] % kBins;
        if (bin < 0) {
            bin += kBins; // % keeps the dividend's sign; fold negatives into range
        }
        atomicAdd(&block_bins[bin], 1);
    }
    __syncthreads(); // every thread's counts must land before the merge

    // Merge this block's partial histogram into the global result.
    for (int b = threadIdx.x; b < kBins; b += blockDim.x) {
        if (block_bins[b] != 0) {
            atomicAdd(&histbins[b], block_bins[b]);
        }
    }
}

// Aborts with a file/line diagnostic when a required CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_status_));                        \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// For memory advice and prefetches, which are performance hints only: a
// failure is reported and cleared so it does not surface at a later check.
#define CUDA_HINT(call)                                                       \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA hint ignored at %s:%d: %s\n", __FILE__,     \
                    __LINE__, cudaGetErrorString(cuda_status_));              \
            (void)cudaGetLastError();                                         \
        }                                                                     \
    } while (0)

// Driver for the CUDA histogram: allocates managed memory, fills the input
// vector with its own index values, launches histcount `repeat` times
// (clearing the bins between runs), then prints the bins of the final run and
// the number of bins that disagree with a histogram recomputed on the host.
int main() {
    const size_t size = 10; //change to 1 << 28
    const size_t arr_bytes = size * sizeof(int32_t);
    const size_t hist_bytes = 10 * sizeof(int32_t);

    const size_t repeat = 30;

    //cuda allocate cpu and gpu memory
    int32_t *vec = NULL, *histbins = NULL;
    CUDA_CHECK(cudaMallocManaged(&vec, arr_bytes));
    CUDA_CHECK(cudaMallocManaged(&histbins, hist_bytes));

    //initialize histogram bins to 0
    CUDA_CHECK(cudaMemset(histbins, 0, hist_bytes));

    //get gpu id
    int device = -1;
    CUDA_CHECK(cudaGetDevice(&device));

    //mem advise
    CUDA_HINT(cudaMemAdvise(vec, arr_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CUDA_HINT(cudaMemAdvise(vec, arr_bytes, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

    //prefetch to create CPU page memory
    CUDA_HINT(cudaMemPrefetchAsync(vec, arr_bytes, cudaCpuDeviceId, NULL));

    //prefetch to create GPU page memory
    CUDA_HINT(cudaMemPrefetchAsync(histbins, hist_bytes, device, NULL));

    //the memset and prefetches are asynchronous; finish them before the host
    //writes to managed memory
    CUDA_CHECK(cudaDeviceSynchronize());

    //initialize arrays with index value
    for (size_t i = 0; i < size; i++) {
        vec[i] = (int32_t)i;
    }

    //prefetching CPU-GPU
    CUDA_HINT(cudaMemPrefetchAsync(vec, arr_bytes, device, NULL));
    CUDA_HINT(cudaMemPrefetchAsync(histbins, hist_bytes, device, NULL));

    //cuda kernel
    const size_t threads = 256;
    const size_t blocks = (size + threads - 1) / threads;

    for (size_t run = 0; run < repeat; run++) {
        histcount <<<blocks, threads>>> (size, histbins, vec);
        CUDA_CHECK(cudaGetLastError()); //launch failures are not returned by <<<>>>

        //clear the histogram between runs, keeping the result of the last run
        if (run + 1 < repeat) {
            CUDA_CHECK(cudaMemset(histbins, 0, hist_bytes));
        }
    }

    CUDA_CHECK(cudaDeviceSynchronize()); //wait GPU to finish

    //prefetch from gpu-cpu
    CUDA_HINT(cudaMemPrefetchAsync(histbins, hist_bytes, cudaCpuDeviceId, NULL));
    CUDA_HINT(cudaMemPrefetchAsync(vec, arr_bytes, cudaCpuDeviceId, NULL));

    //let the prefetches finish before the host reads the results
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("Historgram Bins: \n");
    for (int i = 0; i < 10; i++) {
        printf("Historgram Bin #%d: %d\n", i + 1, histbins[i]);
    }

    printf("\n\nCUDA kernel:\n");
    printf("Number of blocks: %zu\n", blocks); //%zu is the size_t conversion
    printf("Number of threads: %zu\n", threads);
    printf("Number of runs: %zu\n", repeat);
    printf("Array size: %zu\n\n", size);

    //recompute the histogram on the host as the reference for the error check
    int indcheck = 0;
    int32_t histcheck[10] = { 0 };
    for (size_t i = 0; i < size; i++) {
        indcheck = vec[i] % 10;
        histcheck[indcheck]++;
    }

    //error checker
    size_t error = 0;
    for (int i = 0; i < 10; i++) {
        if (histbins[i] != histcheck[i]) {
            error++;
        }
    }

    printf("Number of errors in CUDA program: %zu\n\n", error);

    //free memory
    CUDA_CHECK(cudaFree(vec));
    CUDA_CHECK(cudaFree(histbins));

    return 0;
}

In [None]:
%%shell
# Build the CUDA benchmark, then profile the compiled binary (not the .cu
# source file) and only when compilation succeeded.
nvcc -o CUDA_histcount CUDA_histcount.cu && nvprof ./CUDA_histcount