<a href="https://colab.research.google.com/github/changhyeonnam/cuda_programming/blob/main/cuda_vector_add.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-5u_9cuje
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-5u_9cuje
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
#@title
%%cu
//
// Created by changhyeonnam on 2023/01/10.
//

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// CUDA kernel for element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// __global__ means this is called from the CPU and runs on the GPU.
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid: each
// thread starts at its global index and then advances by the total number of
// launched threads, so every element is processed even when the grid does not
// cover N threads. With a grid that does cover N, each thread handles exactly
// one element.
//
// a, b : input vectors (device pointers), at least N elements each
// c    : output vector (device pointer), at least N elements
// N    : number of elements; nothing is written when N <= 0
__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,
                          int *__restrict c, int N){
    // Total number of threads in the launch. Computed in 64 bits so that
    // gridDim.x * blockDim.x cannot wrap around for very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;

    // Global thread ID (also widened before the multiplication).
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // The loop condition doubles as the boundary check.
    for (; i < N; i += stride)
        c[i] = a[i] + b[i];
}

// Fill the first n entries of a with pseudo-random integers in [0, 99].
// Values come from rand(), so the sequence depends on the current srand() seed.
void matrix_init(int* a, int n){
    // generate_n writes nothing when n <= 0, same as a plain counting loop.
    std::generate_n(a, n, []{ return rand() % 100; });
}
// Verify the vector addition on the host: asserts that every one of the first
// n entries of c equals the sum of the matching entries of a and b.
// Relies on assert(), so a mismatch aborts the program.
void error_check(int* a, int* b, int* c, int n){
    int i = 0;
    while(i < n){
        assert(c[i] == a[i] + b[i]);
        ++i;
    }
}

// Print a sample of the result: for every index that is a multiple of 100
// (0, 100, 200, ...) below n, write one line showing c[i], a[i] and b[i]
// to standard output.
void print_result(int* a, int* b, int* c, int n){
    int i = 0;
    while(i < n){
        const bool sampled = (i % 100 == 0);
        if(sampled){
            std::cout << "c[" << i << "]=" << c[i] << " = "
                      << "a[" << i << "]=" << a[i] << " + "
                      << "b[" << i << "]=" << b[i] << '\n';
        }
        ++i;
    }
}


// Abort with a readable message when a CUDA runtime call fails.
// status : return code of the CUDA call (or of cudaGetLastError())
// what   : short description of the operation, used in the error message
static void checkCuda(cudaError_t status, const char* what){
    if (status != cudaSuccess){
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << '\n';
        std::exit(EXIT_FAILURE);
    }
}

// Adds two random vectors on the GPU, verifies the result on the CPU and
// prints a sample of it. Returns 0 on success; exits with EXIT_FAILURE if any
// CUDA call fails.
int main(){
    // Vector size of 2^16 (65536 elements)
    const int n = 1<<16;

    // Allocation size for all vectors
    const size_t bytes = sizeof(int) * n;

    // Host vectors. std::vector owns the memory, so it is released
    // automatically and allocation failure is reported via an exception.
    std::vector<int> h_a(n), h_b(n), h_c(n);

    // Device vector pointers
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // Allocate device (GPU) memory
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    /* There is also something called unified memory:
     * one set of memory that gets migrated between the GPU and the CPU.
     * [next lecture]
     */

    // Initialize vectors a and b with random values between 0 and 99
    matrix_init(h_a.data(), n);
    matrix_init(h_b.data(), n);

    // Copy input data from the CPU (host) to the GPU (device)
    checkCuda(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_a -> d_a)");
    checkCuda(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_b -> d_b)");

    // Threadblock size.
    // Threads are scheduled in warps of 32, so a multiple of 32 is a good choice.
    const int NUM_THREADS = 256;

    // Grid size: enough blocks to cover every element.
    // Integer ceil-division; plain n / NUM_THREADS truncates and would leave
    // the tail unprocessed whenever n is not a multiple of NUM_THREADS.
    const int NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;

    // Launch the kernel on the default stream with no dynamic shared memory
    vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, n);
    // A kernel launch returns nothing; configuration errors surface here.
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Copy the sum vector from device to host. This blocking copy on the
    // default stream also waits for the kernel to finish, and reports any
    // error raised while the kernel was running.
    checkCuda(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_c -> h_c)");

    // Release device memory; host vectors clean up on their own.
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    // Check result for errors
    error_check(h_a.data(), h_b.data(), h_c.data(), n);
    print_result(h_a.data(), h_b.data(), h_c.data(), n);
    printf("COMPLETED SUCCESFULLY\n");
    return 0;
}



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
c[60538] = 1873037940= a[60538] = -1602005840 b[60538] = -819923516
c[60539] = 2018056200= a[60539] = 111999816 b[60539] = 1906056384
c[60540] = 1346569776= a[60540] = -1034275240 b[60540] = -1914122280
c[60541] = -191690464= a[60541] = 527674952 b[60541] = -719365416
c[60542] = 1444754528= a[60542] = 34477992 b[60542] = 1410276536
c[60543] = 1053245644= a[60543] = 420298100 b[60543] = 632947544
c[60544] = -1098435440= a[60544] = -141890636 b[60544] = -956544804
c[60545] = 1773781304= a[60545] = -946735468 b[60545] = -1574450524
c[60546] = 585388704= a[60546] = 1526217532 b[60546] = -940828828
c[60547] = -2123669452= a[60547] = 655117896 b[60547] = 1516179948
c[60548] = 656100872= a[60548] = 207854896 b[60548] = 448245976
c[60549] = 639628488= a[60549] = 1937685908 b[60549] = -1298057420
c[60550] = 2032269340= a[60550] = -1748835748 b[60550] = -513862208
c[60551] = -1451495088= a[60551] = 1919891232 b[60551] = 923580976
c[60552] = 47674