# ***IF YOU'RE INTERESTED IN RUNNING THE CUDA CODE, PLEASE CHANGE YOUR RUNTIME FROM CPU TO GPU -------AUTHOR: CHLOE GEMAYEL***  #

In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-aje763z8
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-aje763z8
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4294 sha256=c43781aa89b8471c70aed450f46f0503e077c435841edf3cb732cf75218a4f5b
  Stored in directory: /tmp/pip-ephem-wheel-cache-u0zyy22h/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


**CUDA MATRIX MULTIPLICATION**

In [4]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

// Kernel that computes c = a * b on the device; its definition follows main().
__global__ void matrixMulBasic(float *a, float *b, float *c, int M, int N, int K);

// Host driver: multiplies two fixed 3x3 matrices with matrixMulBasic, then
// prints the product and the kernel time measured with CUDA events.
// Returns 0 on success, or -1 after reporting the first CUDA error on stderr.
int main() {
    const int M = 3;  // Number of rows in matrix A
    const int N = 3;  // Number of columns in matrix A and rows in matrix B
    const int K = 3;  // Number of columns in matrix B
    const int BLOCK_DIM = 16;  // Threads per block along x and along y

    float h_A[M * N] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
    float h_B[N * K] = {9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float h_C[M * K] = {0.0f};

    // Handles start out null so the cleanup below is valid on every path,
    // including a failure before all of them were created.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float milliseconds = 0;

    // Ceil-division so the grid covers the whole M x K output.
    dim3 dimGrid((K - 1) / BLOCK_DIM + 1, (M - 1) / BLOCK_DIM + 1, 1);
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM, 1);

    // Every step runs only while all previous calls succeeded, so cudaErr
    // ends up holding the first failure (or cudaSuccess).
    cudaError_t cudaErr = cudaMalloc((void **)&d_A, M * N * sizeof(float));
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMalloc((void **)&d_B, N * K * sizeof(float));
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMalloc((void **)&d_C, M * K * sizeof(float));
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMemcpy(d_B, h_B, N * K * sizeof(float), cudaMemcpyHostToDevice);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventCreate(&start);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventCreate(&stop);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventRecord(start);
    }
    if (cudaErr == cudaSuccess) {
        matrixMulBasic<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, M, N, K);
        // A launch returns no status itself; query it right away so a bad
        // launch configuration is not blamed on a later API call.
        cudaErr = cudaGetLastError();
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventRecord(stop);
    }
    if (cudaErr == cudaSuccess) {
        // Copy the result from device to host
        cudaErr = cudaMemcpy(h_C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventSynchronize(stop);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventElapsedTime(&milliseconds, start, stop);
    }

    if (cudaErr == cudaSuccess) {
        printf("Result Matrix C:\n");
        for (int i = 0; i < M; ++i) {
            for (int j = 0; j < K; ++j) {
                printf("%.2f\t", h_C[i * K + j]);
            }
            printf("\n");
        }
        printf("Execution Time: %f ms\n", milliseconds);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaErr));
    }

    // Release everything that was created, on the success and error paths alike.
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return (cudaErr == cudaSuccess) ? 0 : -1;
}

// Naive matrix-multiply kernel: every in-range thread produces one element of
// c = a * b.
//   a : read as a[row * N + i]   (M rows of N values)
//   b : read as b[i * K + col]   (N rows of K values)
//   c : written as c[row * K + col]
// The thread's x index selects the output column and its y index the output
// row; threads that fall outside the M x K output do nothing.
__global__ void matrixMulBasic(float *a, float *b, float *c, int M, int N, int K) {
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so surplus threads leave here.
    if (outRow >= M || outCol >= K) {
        return;
    }

    const float *aRow = a + outRow * N;  // first element of this row of a
    const float *bCol = b + outCol;      // top element of this column of b

    float acc = 0.0f;
    for (int k = 0; k < N; ++k) {
        acc += aRow[k] * bCol[k * K];
    }
    c[outRow * K + outCol] = acc;
}



Result Matrix C:
30.00	24.00	18.00	
84.00	69.00	54.00	
138.00	114.00	90.00	
Execution Time: 0.022496 ms



**SEQUENTIAL MATRIX MULTIPLICATION**

In [7]:
%%cu
#include <stdio.h>
#include <time.h>

// CPU matrix product c = a * b; its definition follows main().
void matrixMulSequential(float *a, float *b, float *c, int M, int N, int K);

// Host driver: multiplies two fixed 3x3 matrices on the CPU, prints the
// product row by row and the time reported by clock() around the call.
int main() {
    const int M = 3;  // Number of rows in matrix A
    const int N = 3;  // Number of columns in matrix A (and rows in matrix B)
    const int K = 3;  // Number of columns in matrix B

    float h_A[M * N] = {1.0, 2999999999999.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
    float h_B[N * K] = {9.0, 8.0, 7.0, 6.0, 5.0, 4999999999.0, 3.0, 2.0, 1.0};
    float h_C[M * K] = {0.0};

    // Only the multiplication itself sits between the two clock() samples.
    const clock_t tBegin = clock();
    matrixMulSequential(h_A, h_B, h_C, M, N, K);
    const clock_t tEnd = clock();

    printf("Result Matrix C:\n");
    // Walk the flat result once and break the line after every K-th value.
    for (int idx = 0; idx < M * K; ++idx) {
        printf("%.2f\t", h_C[idx]);
        if ((idx + 1) % K == 0) {
            printf("\n");
        }
    }

    const double elapsed = ((double)(tEnd - tBegin)) / CLOCKS_PER_SEC;
    printf("Execution Time: %f seconds\n", elapsed);

    return 0;
}

// Sequential matrix product: overwrites c with a * b.
//   a : M rows of N values, read as a[row * N + i]
//   b : N rows of K values, read as b[i * K + col]
//   c : M rows of K values, written as c[row * K + col]
// Each output element is the dot product of one row of a with one column of b.
void matrixMulSequential(float *a, float *b, float *c, int M, int N, int K) {
    for (int r = 0; r < M; ++r) {
        const float *aRow = a + r * N;  // current row of a
        float *cRow = c + r * K;        // matching row of the output

        for (int col = 0; col < K; ++col) {
            float acc = 0.0;
            for (int k = 0; k < N; ++k) {
                acc += aRow[k] * b[k * K + col];
            }
            cRow[col] = acc;
        }
    }
}


Result Matrix C:
18000000319488.00	15000000266240.00	15000000793194369384448.00	
84.00	69.00	24999999488.00	
138.00	114.00	40000000000.00	
Execution Time: 0.000001 seconds



**SEQUENTIAL MATRIX MULTIPLICATION WITH TILING**

In [8]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// CPU matrix product computed tile by tile; its definition follows main().
void matrixMulSequentialTiled(float *a, float *b, float *c, int M, int N, int K, int tile_size);

// Host driver: multiplies two fixed 3x3 matrices on the CPU using 2x2 tiles,
// prints the product row by row and the time reported by clock() around the call.
int main() {
    const int M = 3;  // Number of rows in matrix A
    const int N = 3;  // Number of columns in matrix A (and rows in matrix B)
    const int K = 3;  // Number of columns in matrix B
    const int tile_size = 2;  // Size of the tile

    float h_A[M * N] = {100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0, 900.0};
    float h_B[N * K] = {900000.0, 800000.0, 7.0, 6.0, 5.0, 4.0, 39999.0, 2.0, 1.0};
    float h_C[M * K] = {0.0};

    // Only the multiplication itself sits between the two clock() samples.
    const clock_t tBegin = clock();
    matrixMulSequentialTiled(h_A, h_B, h_C, M, N, K, tile_size);
    const clock_t tEnd = clock();

    printf("Result Matrix C:\n");
    for (int r = 0; r < M; ++r) {
        const float *rowPtr = h_C + r * K;  // first value of output row r
        for (int col = 0; col < K; ++col) {
            printf("%.2f\t", rowPtr[col]);
        }
        printf("\n");
    }

    const double elapsed = ((double)(tEnd - tBegin)) / CLOCKS_PER_SEC;
    printf("Execution Time: %f seconds\n", elapsed);

    return 0;
}

// Sequential matrix product computed tile by tile: overwrites c with a * b.
//   a : M rows of N values, read as a[i * N + k]
//   b : N rows of K values, read as b[k * K + j]
//   c : M rows of K values, written as c[i * K + j]; previous contents are discarded
//   tile_size : edge length of the square tiles; values below 1 are treated as 1
// The three outer loops pick a tile of the output and a matching slice of the
// inner dimension; the three inner loops add that slice's contribution to
// every output element of the tile. Tiles at the matrix edges are clipped.
void matrixMulSequentialTiled(float *a, float *b, float *c, int M, int N, int K, int tile_size) {
    // A non-positive tile size would never advance the tile loops (endless
    // loop), so fall back to the smallest valid tile.
    if (tile_size < 1) {
        tile_size = 1;
    }

    // The result is built up with += one inner-dimension slice at a time, so
    // the output has to start from zero regardless of what the caller passed.
    for (int idx = 0; idx < M * K; ++idx) {
        c[idx] = 0.0f;
    }

    for (int row = 0; row < M; row += tile_size) {
        for (int col = 0; col < K; col += tile_size) {
            for (int inner = 0; inner < N; inner += tile_size) {
                for (int i = row; i < row + tile_size && i < M; ++i) {
                    for (int j = col; j < col + tile_size && j < K; ++j) {
                        // Partial dot product over this slice of the inner dimension
                        float sum = 0.0f;
                        for (int k = inner; k < inner + tile_size && k < N; ++k) {
                            sum += a[i * N + k] * b[k * K + j];
                        }
                        c[i * K + j] += sum;
                    }
                }
            }
        }
    }
}


Result Matrix C:
102000896.00	80001600.00	1800.00	
384002400.00	320003712.00	5400.00	
666003904.00	560005760.00	9000.00	
Execution Time: 0.000001 seconds



**CUDA MATRIX MULTIPLICATION WITH TILING**

In [9]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

// Edge length of the square shared-memory tiles; also used as the block edge.
#define TILE_SIZE 16

// Shared-memory tiled kernel computing c = a * b; its definition follows main().
__global__ void matrixMulTiled(float *a, float *b, float *c, int M, int N, int K);

// Host driver: multiplies two fixed 3x3 matrices with matrixMulTiled, then
// prints the product and the kernel time measured with CUDA events.
// Returns 0 on success, or -1 after reporting the first CUDA error on stderr.
int main() {
    const int M = 3;  // Number of rows in matrix A
    const int N = 3;  // Number of columns in matrix A (and rows in matrix B)
    const int K = 3;  // Number of columns in matrix B

    float h_A[M * N] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
    float h_B[N * K] = {9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float h_C[M * K] = {0.0f};

    // Handles start out null so the cleanup below is valid on every path,
    // including a failure before all of them were created.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float milliseconds = 0;

    // Ceil-division so the grid covers the whole M x K output; the block is
    // exactly one tile because the kernel indexes its tiles with threadIdx.
    dim3 dimGrid((K - 1) / TILE_SIZE + 1, (M - 1) / TILE_SIZE + 1, 1);
    dim3 dimBlock(TILE_SIZE, TILE_SIZE, 1);

    // Every step runs only while all previous calls succeeded, so cudaErr
    // ends up holding the first failure (or cudaSuccess).
    cudaError_t cudaErr = cudaMalloc((void **)&d_A, M * N * sizeof(float));
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMalloc((void **)&d_B, N * K * sizeof(float));
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMalloc((void **)&d_C, M * K * sizeof(float));
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaMemcpy(d_B, h_B, N * K * sizeof(float), cudaMemcpyHostToDevice);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventCreate(&start);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventCreate(&stop);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventRecord(start);
    }
    if (cudaErr == cudaSuccess) {
        matrixMulTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, M, N, K);
        // A launch returns no status itself; query it right away so a bad
        // launch configuration is not blamed on a later API call.
        cudaErr = cudaGetLastError();
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventRecord(stop);
    }
    if (cudaErr == cudaSuccess) {
        // Copy the result from device to host
        cudaErr = cudaMemcpy(h_C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventSynchronize(stop);
    }
    if (cudaErr == cudaSuccess) {
        cudaErr = cudaEventElapsedTime(&milliseconds, start, stop);
    }

    if (cudaErr == cudaSuccess) {
        printf("Result Matrix C:\n");
        for (int i = 0; i < M; ++i) {
            for (int j = 0; j < K; ++j) {
                printf("%.2f\t", h_C[i * K + j]);
            }
            printf("\n");
        }
        printf("Execution Time: %f ms\n", milliseconds);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaErr));
    }

    // Release everything that was created, on the success and error paths alike.
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return (cudaErr == cudaSuccess) ? 0 : -1;
}

// Shared-memory tiled matrix-multiply kernel: c = a * b.
//   a : M rows of N values, read as a[row * N + k]
//   b : N rows of K values, read as b[k * K + col]
//   c : M rows of K values, written as c[row * K + col]
// Launch requirements: a 2D block of exactly TILE_SIZE x TILE_SIZE threads
// (the tiles are indexed directly with threadIdx) and a grid that covers the
// M x K output. Uses 2 * TILE_SIZE * TILE_SIZE floats of static shared memory.
// M, N and K do not have to be multiples of TILE_SIZE.
__global__ void matrixMulTiled(float *a, float *b, float *c, int M, int N, int K) {
    __shared__ float tile_a[TILE_SIZE][TILE_SIZE];
    __shared__ float tile_b[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;

    // Every thread of the block walks all tiles, including threads whose
    // output element is out of range: they still have to help fill the tiles
    // and must reach each __syncthreads(). The trip count depends only on N,
    // so it is the same for the whole block.
    for (int tile = 0; tile < N; tile += TILE_SIZE) {
        int aCol = tile + threadIdx.x;  // column of a this thread loads
        int bRow = tile + threadIdx.y;  // row of b this thread loads

        // Load tiles into shared memory. Positions outside the matrices are
        // stored as 0 so they add nothing to the dot product and no global
        // read goes out of bounds.
        tile_a[threadIdx.y][threadIdx.x] =
            (row < M && aCol < N) ? a[row * N + aCol] : 0.0f;
        tile_b[threadIdx.y][threadIdx.x] =
            (bRow < N && col < K) ? b[bRow * K + col] : 0.0f;

        // Synchronize to make sure the tiles are loaded
        __syncthreads();

        // Compute partial sum within the tile
        for (int i = 0; i < TILE_SIZE; ++i) {
            sum += tile_a[threadIdx.y][i] * tile_b[i][threadIdx.x];
        }

        // Synchronize before loading the next tile
        __syncthreads();
    }

    // Write the final result to the output matrix (in-range threads only)
    if (row < M && col < K) {
        c[row * K + col] = sum;
    }
}


Result Matrix C:
30.00	24.00	18.00	
84.00	69.00	54.00	
138.00	114.00	90.00	
Execution Time: 0.022208 ms

