# Introduction to CUDA programming


### Check the CUDA compiler version

In [None]:
!nvidia-smi

In [None]:
!nvcc --version

In [None]:
!git clone https://github.com/NVIDIA/cuda-samples.git

In [None]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && make


In [None]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && ls
!cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery

## nvcc for Jupyter notebook

In [None]:
!pip install nvcc4jupyter

In [None]:
%load_ext nvcc4jupyter

In [None]:
%%cuda
#include <iostream>
// Host-only program: writes a greeting to standard output from the CPU.
// No kernel is launched and no CUDA API is called.
// Returns 0.
int main()
{
    // Note: no trailing newline is written after the text.
    std::cout << "Hello World";

    return 0;
}

In [None]:
%%cuda
#include <iostream>
// Empty kernel: takes no arguments and performs no work on the device.
// It only provides something to launch with the <<<grid, block>>> syntax.
__global__ void kernel( void ) {}
// Launches the empty kernel on one block of one thread, waits for it to
// finish, then prints a greeting from the host.
//
// Returns 0 on success, 1 if the launch or the synchronization reports a
// CUDA error (the error text is written to standard error).
int main( void ) {
    kernel<<<1,1>>>();

    // A kernel launch returns no status of its own: cudaGetLastError()
    // reports launch-configuration problems, and cudaDeviceSynchronize()
    // waits for the kernel and reports errors raised while it was running.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << "\n";
        return 1;
    }

    // Written with std::cout because <iostream> is the only header this
    // program includes.
    std::cout << "Hello, World!\n";
    return 0;
}

In [None]:
%%cuda
// Adding two numbers on the GPU
#include <iostream>

// Kernel: stores a + b into the int that `c` points to.
//
//   a, b  operands, passed by value
//   c     destination for the sum; it is dereferenced in device code, so it
//         must point to memory the device can write
//
// Every launched thread performs the same store.
__global__ void add(int a, int b, int *c)
{
    const int sum = a + b;
    *c = sum;
}
// Adds 28 and 17 on the GPU and prints the sum on the host.
//
// Steps: allocate one int on the device, launch add<<<1,1>>>, copy the
// result back, print it, and release the device allocation.
//
// Returns 0 on success, 1 if any CUDA call fails (the error text is written
// to standard error and the sum is not printed).
int main( void )
{
    int c = 0;
    int *dev_c = 0;   // stays null if cudaMalloc fails, so cudaFree below is a no-op

    cudaError_t status = cudaMalloc( (void**)&dev_c, sizeof(int) );

    if (status == cudaSuccess) {
        add<<<1,1>>>( 28, 17, dev_c );
        // A launch returns no status of its own; ask the runtime for it.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // Blocking copy: waits for the kernel to finish before reading dev_c,
        // and reports any error raised while the kernel was running.
        status = cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
    }

    if (status == cudaSuccess) {
        // std::cout is used because <iostream> is the only header included.
        std::cout << "Sum from GPU = " << c << "\n";
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << "\n";
    }

    cudaFree( dev_c );
    return status == cudaSuccess ? 0 : 1;
}

In [None]:
%%cuda
#define N 10

#include<iostream>
using namespace std;
// Kernel: element-wise vector addition, out[i] = a[i] + b[i] for i in [0, n).
//
//   out      destination array, at least n floats
//   a, b     input arrays, at least n floats each
//   n        number of elements to process
//
// All three pointers are dereferenced in device code, so they must refer to
// device-accessible memory.
//
// The loop starts at the thread's global index and advances by the total
// number of launched threads, so every element is handled by exactly one
// thread for any 1-D launch configuration. With <<<1,1>>> the single thread
// walks i = 0, 1, ..., n-1 sequentially.
__global__ void vector_add(float *out, float *a, float *b, int n) {
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA call on standard error.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
}

// Adds two N-element host vectors on the GPU and prints, for every index,
// the two inputs followed by the computed sum.
//
// Steps: allocate and fill the host arrays, allocate device arrays, copy the
// inputs to the device, launch vector_add, copy the result back, print it,
// and release all host and device memory.
//
// Returns 0 on success, 1 if a host allocation or any CUDA call fails. All
// cleanup runs on both paths.
int main(){
    const size_t bytes = sizeof(float) * N;

    float *a   = (float*)malloc(bytes);
    float *b   = (float*)malloc(bytes);
    float *out = (float*)malloc(bytes);
    // Null-initialized so the cudaFree calls below are no-ops for any
    // buffer that was never allocated.
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;

    bool ok = (a != NULL) && (b != NULL) && (out != NULL);
    if (!ok) {
        std::cerr << "Host allocation failed\n";
    }

    if (ok) {
        // Initialize the host arrays
        for (int i = 0; i < N; i++){
            a[i] = 1.0f;
            b[i] = 2.0f;
            out[i] = 0.0f;
        }
    }

    // Allocate device memory and upload the two inputs. d_out is only
    // written by the kernel, so it does not need a host-to-device copy.
    ok = ok
        && cudaOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
        && cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
        && cudaOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
        && cudaOk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)")
        && cudaOk(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    if (ok) {
        // One block of one thread: that thread processes all N elements.
        vector_add<<<1,1>>>(d_out, d_a, d_b, N);

        // The launch itself returns no status; the blocking copy then waits
        // for the kernel and surfaces any error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "vector_add launch")
          && cudaOk(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(out)");
    }

    if (ok) {
        for (int idx = 0; idx < N; idx++)
        {
            std::cout << a[idx] << " , ";
            std::cout << b[idx] << " , ";
            std::cout << out[idx] << " , ";
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    return ok ? 0 : 1;
}

In [None]:
%%writefile vector_add.cu
#define N 10

#include<iostream>
using namespace std;
// Kernel: element-wise vector addition, out[i] = a[i] + b[i] for i in [0, n).
//
//   out      destination array, at least n floats
//   a, b     input arrays, at least n floats each
//   n        number of elements to process
//
// All three pointers are dereferenced in device code, so they must refer to
// device-accessible memory.
//
// The loop starts at the thread's global index and advances by the total
// number of launched threads, so every element is handled by exactly one
// thread for any 1-D launch configuration. With <<<1,1>>> the single thread
// walks i = 0, 1, ..., n-1 sequentially.
__global__ void vector_add(float *out, float *a, float *b, int n) {
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA call on standard error.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
}

// Adds two N-element host vectors on the GPU and prints, for every index,
// the two inputs followed by the computed sum.
//
// Steps: allocate and fill the host arrays, allocate device arrays, copy the
// inputs to the device, launch vector_add, copy the result back, print it,
// and release all host and device memory.
//
// Returns 0 on success, 1 if a host allocation or any CUDA call fails. All
// cleanup runs on both paths.
int main(){
    const size_t bytes = sizeof(float) * N;

    float *a   = (float*)malloc(bytes);
    float *b   = (float*)malloc(bytes);
    float *out = (float*)malloc(bytes);
    // Null-initialized so the cudaFree calls below are no-ops for any
    // buffer that was never allocated.
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;

    bool ok = (a != NULL) && (b != NULL) && (out != NULL);
    if (!ok) {
        std::cerr << "Host allocation failed\n";
    }

    if (ok) {
        // Initialize the host arrays
        for (int i = 0; i < N; i++){
            a[i] = 1.0f;
            b[i] = 2.0f;
            out[i] = 0.0f;
        }
    }

    // Allocate device memory and upload the two inputs. d_out is only
    // written by the kernel, so it does not need a host-to-device copy.
    ok = ok
        && cudaOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
        && cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
        && cudaOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
        && cudaOk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)")
        && cudaOk(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    if (ok) {
        // One block of one thread: that thread processes all N elements.
        vector_add<<<1,1>>>(d_out, d_a, d_b, N);

        // The launch itself returns no status; the blocking copy then waits
        // for the kernel and surfaces any error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "vector_add launch")
          && cudaOk(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(out)");
    }

    if (ok) {
        for (int idx = 0; idx < N; idx++)
        {
            std::cout << a[idx] << " , ";
            std::cout << b[idx] << " , ";
            std::cout << out[idx] << " , ";
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    return ok ? 0 : 1;
}

In [None]:
%%cuda
/* Hello World cuda program*/
#include <iostream>
#include <cuda.h>
#include <stdio.h>

using namespace std;

// Kernel: every launched thread prints one greeting line.
// Device code uses printf here; std::cout does not work in device code.
__global__ void myKernel()
{
    printf("GPU: Hello World.\n");
}

// Prints a greeting from the host, then launches myKernel on one block of
// one thread. Returns 0.
//
// NOTE(review): nothing waits for the kernel before main returns, so the
// device-side printf output may never be displayed. This looks intentional
// for the demonstration; confirm before adding a synchronization call.
int main()
{
    std::cout << "CPU : Hello World";

    // Asynchronous launch; no synchronization follows.
    myKernel<<<1, 1>>>();

    return 0;
}

In [None]:
!nvcc vector_add.cu -o vector_add

In [None]:
!nvprof ./vector_add

In [None]:
%%cuda
#include <iostream>
#include <cuda.h>
#include <stdio.h>

using namespace std;

// Kernel: every launched thread prints one greeting line with the
// device-side printf.
// Print statements inside kernel functions are best avoided outside of
// demonstrations and debugging.
__global__ void myKernel()
{
    printf("GPU: Hello World.\n");
}

// Prints a greeting from the host, launches myKernel on one block of one
// thread, and waits for the kernel so its printf output is flushed before
// the program exits.
//
// Returns 0 on success, 1 if the launch or the synchronization reports a
// CUDA error (the error text is written to standard error).
int main(){
    std::cout << "CPU : Hello World";
    myKernel<<<1, 1>>>();

    // cudaDeviceSynchronize() is the supported replacement for the
    // deprecated cudaThreadSynchronize(). The launch returns no status of
    // its own, so cudaGetLastError() is queried first for launch errors.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << "\n";
        return 1;
    }
    return 0;
}

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
// Kernel: every launched thread prints the same greeting line, so the text
// appears once per thread in the launch.
__global__ void myKernel()
{
    printf("Hello World.\n");
}
// Launches myKernel on one block of 32 threads and waits for it to finish
// so the device-side printf output is flushed before the program exits.
//
// Returns 0 on success, 1 if the launch or the synchronization reports a
// CUDA error (the error text is written to stderr).
int main() {
    myKernel<<<1, 32>>>();

    // cudaDeviceSynchronize() is the supported replacement for the
    // deprecated cudaThreadSynchronize(). The launch returns no status of
    // its own, so cudaGetLastError() is queried first for launch errors.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
#define N 10

// Device helper: returns the calling thread's global index along x,
// computed as blockIdx.x * blockDim.x + threadIdx.x.
__device__ int getID()
{
    // Kept in unsigned arithmetic (the type of the built-in index
    // variables) until the final conversion to int on return.
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    return blockStart + threadIdx.x;
}
// Kernel: each thread prints "<threadIdx.x> - <global id>", where the
// global id comes from getID().
__global__ void myKernel()
{
    const int globalId = getID();
    printf("%d - %d\n", threadIdx.x, globalId);
}
// Launches myKernel on N blocks of one thread each and waits for it to
// finish so the device-side printf output is flushed before the program exits.
//
// Returns 0 on success, 1 if the launch or the synchronization reports a
// CUDA error (the error text is written to stderr).
int main() {
    myKernel<<<N, 1>>>();

    // cudaDeviceSynchronize() is the supported replacement for the
    // deprecated cudaThreadSynchronize(). The launch returns no status of
    // its own, so cudaGetLastError() is queried first for launch errors.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

# Typical CUDA program
- Load data to CPU memory
- Transfer data to GPU memory
- Launch kernels to act on the data
- Transfer back data to CPU memory
- Proceed with CPU execution

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
#define N 10
// Kernel: each thread writes ten times its threadIdx.x into the slot of `a`
// selected by that same threadIdx.x.
//
//   a   output array, dereferenced in device code; it must hold at least
//       blockDim.x ints
//
// Only threadIdx.x is used for indexing, so threads with the same
// threadIdx.x in different blocks write the same slot.
__global__ void scaleArr(int *a)
{
    const unsigned int tid = threadIdx.x;
    a[tid] = tid * 10;
}

// Fills an N-element array on the GPU with scaleArr and prints it on the
// host, one value per line.
//
// Steps: allocate N ints on the device, launch scaleArr on one block of N
// threads, copy the result back, print it, and free the device allocation.
//
// Returns 0 on success, 1 if any CUDA call fails (the error text is written
// to stderr and no values are printed).
int main() {
    int a_hs[N];
    int *a_dev = NULL;   // stays null if cudaMalloc fails, so cudaFree below is a no-op

    cudaError_t status = cudaMalloc(&a_dev, N * sizeof(int));

    if (status == cudaSuccess) {
        scaleArr<<<1, N>>>(a_dev);
        // A launch returns no status of its own; ask the runtime for it.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // Blocking copy: waits for the kernel before reading a_dev and
        // reports any error raised while the kernel was running.
        status = cudaMemcpy(a_hs, a_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess) {
        for (int i = 0; i < N; ++i)
            printf("%d\n", a_hs[i]);
    } else {
        // a_hs holds no valid data here, so it is not printed.
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaFree(a_dev);   // the device buffer was previously never released
    return status == cudaSuccess ? 0 : 1;
}

## Finding the Unique ID of different threads?

https://github.com/tpn/pdfs/blob/master/CUDA%20Thread-Indexing%20Cheatsheet.pdf