# Connection
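This notebook acts as the connection between my local development environment and my Google Colab virtual environment. Each CUDA source file is written to the Colab runtime with `%%writefile`, then compiled with `nvcc` and executed on the Colab GPU.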

In [None]:
# Sanity check: ask the NVIDIA driver which GPU (if any) is attached to this runtime
# before compiling anything. NOTE(review): if this fails, the runtime presumably has
# no GPU attached and the CUDA cells below will not run — confirm the runtime type.
!nvidia-smi

In [2]:
%%writefile hello.cu
#include <iostream>
#include <cuda_runtime.h>

// Kernel (device code): every launched thread prints one greeting line that
// carries its own block index and its thread index within that block.
__global__ void hello_from_gpu() {
    // The built-in indices are unsigned; convert to int to match the %d specifiers.
    printf("Hello from GPU! Block: %d, Thread: %d\n",
           static_cast<int>(blockIdx.x),
           static_cast<int>(threadIdx.x));
}

int main() {
    std::cout << "Hello from the CPU!" << std::endl;

    // Launch the kernel: 2 blocks, 4 threads each
    // Syntax: kernel_name<<<blocks, threads_per_block>>>();
    hello_from_gpu<<<2, 4>>>();

    // Wait for the GPU to finish before the CPU continues
    cudaDeviceSynchronize();

    return 0;
}

Writing hello.cu


In [5]:

!nvcc hello.cu -o hello -arch=sm_75
!./hello

Hello from the CPU!
Hello from GPU! Block: 0, Thread: 0
Hello from GPU! Block: 0, Thread: 1
Hello from GPU! Block: 0, Thread: 2
Hello from GPU! Block: 0, Thread: 3
Hello from GPU! Block: 1, Thread: 0
Hello from GPU! Block: 1, Thread: 1
Hello from GPU! Block: 1, Thread: 2
Hello from GPU! Block: 1, Thread: 3


In [None]:
%%writefile hello_cuda.cu
#include <iostream>
#include <cstdlib>


__global__ void my_first_kernel(int* A, int* B, int* C) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    
    // The "Guard": Only do work if we are inside the array bounds
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    // allocate memory on the host and fill with data
    size_t s = 100000000;
    int* a = (int*) malloc(sizeof(int)*s);
    int* b = (int*) malloc(sizeof(int)*s);
    int* c = (int*) malloc(sizeof(int)*s);

    for (int i = 0; i < s; i++){
        a[i] = i; b[i] = i;
    }

    // allocate memory on the device 
    int *d_a, *d_b, *d_c;

    cudaMalloc((void**)&d_a, sizeof(int)*s);
    cudaMalloc((void**)&d_b, sizeof(int)*s);
    cudaMalloc((void**)&d_c, sizeof(int)*s);

    // copy the data from host to device
    // cudaMemcpy(destination, source, size, direction);
    cudaMemcpy(d_a, a, sizeof(int)*s, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(int)*s, cudaMemcpyHostToDevice);

    // launch
    int threadsPerBlock = 256;
    int blocksPerGrid = (s + threadsPerBlock - 1) / threadsPerBlock;

    my_first_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);

    cudaMemcpy(c, d_c, sizeof(int)*s, cudaMemcpyDeviceToHost);

    // Wait for GPU to finish
    cudaDeviceSynchronize();

    for (int i = 0; i < 3; ++i){
        std::cout << c[i] << std::endl;
    }

    // Free all memory allocations
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

Overwriting hello_cuda.cu


In [42]:
!nvcc hello_cuda.cu -o hello_cuda -arch=sm_75
!./hello_cuda

0
2
4
