<a href="https://colab.research.google.com/github/chi-yan/notebooks/blob/master/CUDA_in_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Running the code examples from the Udemy lecture

https://www.udemy.com/course/introduction-to-parallel-programming-using-gpgpu-and-cuda/learn/lecture/8077270#learning-tools

in the Google Colab environment.


In [None]:
%%writefile hello_world.cu

#include "stdio.h"

// Kernel with an empty body: it performs no work and takes no arguments.
// main() launches it with a single block containing a single thread.
__global__ void mykernel()
{
}

// Launches the empty kernel on one block of one thread, verifies that the
// launch and the kernel execution succeeded, then prints a greeting.
//
// Returns 0 on success, 1 if the CUDA runtime reported an error.
int main(void) {
  mykernel<<<1,1>>>();

  // A kernel launch has no return value: launch-configuration errors are
  // reported by cudaGetLastError(), and errors raised while the kernel runs
  // only surface at the next synchronizing call.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  printf("Hello world!\n");
  return 0;
}

Overwriting hello_world.cu


In [None]:
!nvcc hello_world.cu

In [None]:
!./a.out

Hello world!


In [None]:
%%writefile add.cu

#include "stdio.h"

// Adds the two integers that a and b point to and stores the sum through c.
// Every thread that runs this kernel performs the same single addition;
// main() launches it with one block of one thread.
__global__ void add(int *a, int *b, int *c) {
  const int lhs = *a;
  const int rhs = *b;
  *c = lhs + rhs;
}
// Adds two integers on the GPU and prints the result.
//
// Steps: allocate three device ints, copy the inputs to the device, launch
// add() on one block of one thread, copy the sum back, free device memory.
// Each CUDA call is attempted only if all previous ones succeeded, so the
// first failure is the one that gets reported.
//
// Returns 0 on success, 1 if any CUDA runtime call or the launch failed.
int main(void) {
  int a = 2, b = 9;  // host copies of the inputs
  int c = 0;         // host copy of the result
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;  // device copies of a, b, c
  const size_t size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaError_t err = cudaMalloc((void **)&d_a, size);
  if (err == cudaSuccess) err = cudaMalloc((void **)&d_b, size);
  if (err == cudaSuccess) err = cudaMalloc((void **)&d_c, size);

  // Copy inputs to device
  if (err == cudaSuccess) err = cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  if (err == cudaSuccess) err = cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU; the launch itself returns nothing, so
  // launch errors have to be fetched with cudaGetLastError().
  if (err == cudaSuccess) {
    add<<<1,1>>>(d_a, d_b, d_c);
    err = cudaGetLastError();
  }

  // Copy result back to host (errors raised while the kernel ran are
  // reported by this blocking copy).
  if (err == cudaSuccess) err = cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

  // Cleanup: pointers that were never allocated are still NULL, and
  // cudaFree() performs no operation for a null pointer.
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("Result: %d",c);
  return 0;
}

Overwriting add.cu


In [None]:
!nvcc add.cu 
!./a.out

Result: 11

Result: 7

In [31]:
%%writefile add2.cu

#include "stdio.h"

// Element-wise integer add, one array element per block: the element index
// is the block's x index, so every thread of a block handles the same
// element. Each thread also prints the block index with device-side printf.
// There is no bounds check; a, b and c must hold at least gridDim.x ints.
__global__
void add(int *a, int *b, int *c) {
    const unsigned int blk = blockIdx.x;
    const int sum = a[blk] + b[blk];
    c[blk] = sum;
    printf("Block: %d\n", blk);
}

// Fills the first N entries of a with pseudo-random values in [0, 99],
// drawing one rand() value per entry in index order.
// Does nothing when N is zero or negative.
void random_ints(int* a, int N) {
    int idx = 0;
    while (idx < N) {
        a[idx] = rand() % 100;
        ++idx;
    }
}
// Number of array elements; also the number of blocks in the launch.
#define N 16

// Adds two N-element random integer arrays on the GPU, using one block per
// element, and prints element 4 of each input and of the result.
//
// Host and device allocations, copies and the kernel launch are all checked;
// each CUDA call is attempted only if the previous ones succeeded, so the
// first failure is the one that gets reported. All buffers are released on
// every path.
//
// Returns 0 on success, 1 on a host allocation or CUDA failure.
int main(void) {
    int *a = NULL, *b = NULL, *c = NULL;        // host copies of a, b, c
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;  // device copies of a, b, c
    const size_t size = N * sizeof(int);
    int exit_code = 1;

    // Alloc space for host copies of a, b, c
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
    } else {
        // Setup input values: a is filled before b, so both draw from the
        // rand() sequence in that order.
        random_ints(a, N);
        random_ints(b, N);

        // Alloc space for device copies of a, b, c
        cudaError_t err = cudaMalloc((void **)&d_a, size);
        if (err == cudaSuccess) err = cudaMalloc((void **)&d_b, size);
        if (err == cudaSuccess) err = cudaMalloc((void **)&d_c, size);

        // Copy inputs to device
        if (err == cudaSuccess) err = cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) err = cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

        // Launch add() kernel on GPU with N blocks; the launch returns
        // nothing, so launch errors are fetched with cudaGetLastError().
        if (err == cudaSuccess) {
            add<<<N,1>>>(d_a, d_b, d_c);
            err = cudaGetLastError();
        }

        // Copy result back to host (errors raised while the kernel ran are
        // reported by this blocking copy).
        if (err == cudaSuccess) err = cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess) {
            printf("Result: %d %d %d",a[4],b[4],c[4]);
            exit_code = 0;
        } else {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        }
    }

    // Cleanup: free(NULL) is a no-op, and cudaFree() performs no operation
    // for a null pointer, so this is safe on every path.
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return exit_code;
}

Overwriting add2.cu


In [32]:
!nvcc add2.cu 
!./a.out

Block: 15
Block: 9
Block: 6
Block: 3
Block: 12
Block: 0
Block: 10
Block: 4
Block: 7
Block: 1
Block: 8
Block: 14
Block: 11
Block: 13
Block: 5
Block: 2
Result: 93 11 104