<a href="https://colab.research.google.com/github/ggruszczynski/gpu_colab/blob/main/example20_element_wise_Matrix_Add.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Element-wise Matrix Add

Step-by-step instructions were presented in tutorial 2; now it is time for some stand-alone practice.

Accelerate the serial, element-wise addition of two square matrices by using a CUDA kernel.

In [None]:
%%file matrix_add.cu

#include <stdio.h>


// Serial (CPU) element-wise addition of two N x N matrices.
//
// For every pair 0 <= i, j < N the element at flat offset i + j*N is computed as
//     c[i + j*N] = a[i + j*N] + b[i + j*N]
//
// Parameters:
//   a, b - input arrays; offsets 0 .. N*N-1 are read
//   c    - output array; offsets 0 .. N*N-1 are written
//   N    - number of rows (= number of columns); nothing is done when N <= 0
void cpu_add_matrix_elementwise (float* a, float* b, float* c, int N)
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            // same flat offset for all three arrays
            const int index = i + j*N;
            c[index] = a[index] + b[index];
        }
    }
}

// Writes an N x N matrix to stdout.
//
// A newline is emitted before each group of N values; within the group for a
// fixed i, the values Matrix[i + j*N] are printed for j = 0 .. N-1, each
// formatted with " %f ".
void print_matrix(float *Matrix, const int N)
{
    for (int line = 0; line < N; ++line)
    {
        printf("\n");

        int entry = 0;
        while (entry < N)
        {
            const float value = Matrix[entry*N + line];
            printf(" %f ", value);
            ++entry;
        }
    }
}

// Runs the serial reference computation for N x N matrices.
//
// Allocates three host arrays A, B and C of N*N floats, fills A with 2*index
// and B with 3*index (index = i + j*N), computes C = A + B with
// cpu_add_matrix_elementwise, prints C with print_matrix and releases the
// memory again.
//
// Nothing is allocated or printed when N <= 0. If any allocation fails, a
// message is written to stderr and the function returns without computing.
void CPU_version_wrapper(const int N)
{
    if (N <= 0)
        return;

    // Byte count computed in size_t: N*N*sizeof(float) does not fit in an int
    // once N grows beyond a few tens of thousands.
    const size_t mem_size = (size_t)N * (size_t)N * sizeof(float);

    float* A = (float*)malloc(mem_size);
    float* B = (float*)malloc(mem_size);
    float* C = (float*)malloc(mem_size);

    if (A == NULL || B == NULL || C == NULL)
    {
        fprintf(stderr, "CPU_version_wrapper: failed to allocate %zu bytes per matrix\n", mem_size);
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A); free(B); free(C);
        return;
    }

    // initialize data
    for (int i=0; i <N; ++i)
    {
        for (int j=0; j <N; ++j)
        {
            int index = i + j*N;
            A[index] = 2.*index;
            B[index] = 3.*index;
        }
    }

    // run calculations
    cpu_add_matrix_elementwise(A,B,C,N);
    print_matrix(C, N);

    // Free memory
    free(A); free(B); free(C);
}

// Entry point: runs the CPU reference for an 8 x 8 matrix, prints a separator
// line, and leaves a placeholder where the GPU version is to be called.
int main()
{
    const int matrix_dim = 8;

    CPU_version_wrapper(matrix_dim);

    printf("\n----------------------------------\n");

    // Exercise: implement GPU_version_wrapper and enable the call below.
    //GPU_version_wrapper(matrix_dim);

    printf("\n");

    return 0;
}

In [None]:
# Print a heading, then run nvidia-smi to list the driver version, CUDA version and GPU model of this runtime.
!echo "Check your GPU version"
!nvidia-smi

Check your GPU version
Wed Feb 23 17:54:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------------------------------------------

In [None]:
%%bash

# Stop at the first failing command, so a failed compile never runs a stale binary.
set -e

# Compute capability of the target GPU, written without the dot.
# 75 = Tesla T4 (compute capability 7.5). Code built only for sm_35 contains no
# kernel image a T4 can execute, and compute_35 is rejected by CUDA 12 toolkits.
# Adjust this value if nvidia-smi reports a different GPU model.
CUDA_SUFF=75

# Compile the file written by the %%file cell for that architecture, then run it.
nvcc -gencode arch=compute_${CUDA_SUFF},code=sm_${CUDA_SUFF} ./matrix_add.cu -o matrix_add
./matrix_add