In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-9shl3zbw
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-9shl3zbw
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=9247f813c407502f9a6b29b060020e10bfe689e58d4bc1b964dbf4247ab3ae33
  Stored in directory: /tmp/pip-ephem-wheel-cache-f4lnwenp/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
//vector addition CUDA  1
#include<iostream>
#include<time.h>
#include<cuda.h>
#define SIZE 100000
using namespace std;

// Element-wise vector addition on the device: resultVect[i] = vect1[i] + vect2[i].
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid may
// contain more threads than elements; threads whose global index is >= SIZE
// return without touching memory.
//
// Parameters:
//   vect1, vect2 - input vectors, indexed 0 .. SIZE-1
//   resultVect   - output vector, indexed 0 .. SIZE-1
__global__ void addVect(int *vect1 ,int *vect2 , int *resultVect){
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    // SIZE is not a multiple of the block size used by the host, so the last
    // block carries surplus threads. Without this guard they would read and
    // write past the end of all three buffers.
    if (i < SIZE) {
        resultVect[i] = vect1[i] + vect2[i];
    }
}

// Evaluates a CUDA runtime call and, on failure, prints the error string and
// makes the enclosing function return 1. Only usable inside a function that
// returns int (here: main). Device buffers are not released on this path; the
// process is about to exit anyway.
#define VEC_CUDA_CHECK(call)                                                  \
    do {                                                                      \
        cudaError_t vecErr_ = (call);                                         \
        if (vecErr_ != cudaSuccess) {                                         \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(vecErr_) << std::endl;            \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Adds two SIZE-element integer vectors on the GPU and on the CPU, checks the
// two results against each other, and prints both timings and the speedup.
//
// Returns 0 when every element matches, 1 on a CUDA error or a mismatch.
int main(){
    int *d_inVect1 = nullptr, *d_inVect2 = nullptr, *d_outResultVector = nullptr;   // data storage for gpu
    // Host buffers have static storage so that ~1.6 MB of arrays do not live on
    // the stack (which would overflow if SIZE were increased).
    static int vect1[SIZE], vect2[SIZE], resultVect[SIZE], resultVect2[SIZE];       // data storage for cpu
    cudaEvent_t gpu_start, gpu_stop;
    float gpu_elapsed_time = 0.0f;   // milliseconds, as reported by cudaEventElapsedTime
    const size_t bytes = SIZE * sizeof(int);

    // Initializing both the vectors
    for(int i = 0 ; i < SIZE ; i++){
        vect1[i] = i;
        vect2[i] = i;
    }

    // Parallel code

    // Allocate memory on GPU for 3 vectors
    VEC_CUDA_CHECK(cudaMalloc((void**)&d_inVect1, bytes));
    VEC_CUDA_CHECK(cudaMalloc((void**)&d_inVect2, bytes));
    VEC_CUDA_CHECK(cudaMalloc((void**)&d_outResultVector, bytes));

    // Copy the vector contents to the device
    VEC_CUDA_CHECK(cudaMemcpy(d_inVect1, vect1, bytes, cudaMemcpyHostToDevice));
    VEC_CUDA_CHECK(cudaMemcpy(d_inVect2, vect2, bytes, cudaMemcpyHostToDevice));

    VEC_CUDA_CHECK(cudaEventCreate(&gpu_start));
    VEC_CUDA_CHECK(cudaEventCreate(&gpu_stop));

    // Ceil-division: enough 1024-thread blocks to cover SIZE elements, without
    // launching a spare block when SIZE is an exact multiple of the block size.
    const int threadsPerBlock = 1024;
    const int blk = (SIZE + threadsPerBlock - 1) / threadsPerBlock;

    // The two events bracket only the kernel in the default stream.
    VEC_CUDA_CHECK(cudaEventRecord(gpu_start, 0));
    addVect<<<blk, threadsPerBlock>>>(d_inVect1, d_inVect2, d_outResultVector);
    VEC_CUDA_CHECK(cudaGetLastError());              // catches an invalid launch configuration
    VEC_CUDA_CHECK(cudaEventRecord(gpu_stop, 0));
    // The stop event must have completed before the elapsed time is queried;
    // this also surfaces any fault raised while the kernel was running.
    VEC_CUDA_CHECK(cudaEventSynchronize(gpu_stop));
    VEC_CUDA_CHECK(cudaEventElapsedTime(&gpu_elapsed_time, gpu_start, gpu_stop));
    VEC_CUDA_CHECK(cudaEventDestroy(gpu_start));
    VEC_CUDA_CHECK(cudaEventDestroy(gpu_stop));

    // Copy gpu mem to cpu mem
    VEC_CUDA_CHECK(cudaMemcpy(resultVect, d_outResultVector, bytes, cudaMemcpyDeviceToHost));

    cout<<"The time taken by GPU is :"<<gpu_elapsed_time<<endl;

    // verify that the GPU did the work we requested
    bool success = true;
    int total=0;
    cout<<"\n Checking "<<SIZE<<" values in the array.......\n";
    for (int i=0; i<SIZE; i++) {
        if ((vect1[i] + vect2[i]) != resultVect[i]) {
            printf( "Error:  %d + %d != %d\n", vect1[i], vect2[i], resultVect[i] );
            success = false;
        }
        total += 1;
    }
    if (success)  cout<<"We did it "<<total<<"  values correct!\n";

    // Sequential code of vector addition with time measurement
    clock_t startTime = clock();
    for(int i = 0 ; i < SIZE ; i++){
        resultVect2[i] = vect1[i] + vect2[i];
    }
    clock_t endTime = clock();
    const float cpu_elapsed_ms = ((float)(endTime-startTime)/CLOCKS_PER_SEC)*1000;

    // Cross-check the CPU result against the GPU result. Reading resultVect2
    // also keeps the timed loop above from being optimized away as dead code.
    for (int i = 0; i < SIZE; i++) {
        if (resultVect2[i] != resultVect[i]) {
            success = false;
            break;
        }
    }

    cout<<"\nTime for sequential: "<<cpu_elapsed_ms;
    if (!success) {
        cout<<"\nResults are NOT correct, speedup not reported\n";
    } else if (gpu_elapsed_time > 0.0f) {
        cout<<"\nAll results are correct!!!, \n Speedup = "<<cpu_elapsed_ms / gpu_elapsed_time<<"\n";
    } else {
        // Avoid dividing by a zero GPU time.
        cout<<"\nAll results are correct!!!, \n Speedup = n/a (GPU time measured as 0)\n";
    }

    // free the memory we allocated on the GPU
    VEC_CUDA_CHECK(cudaFree(d_inVect1));
    VEC_CUDA_CHECK(cudaFree(d_inVect2));
    VEC_CUDA_CHECK(cudaFree(d_outResultVector));

    return success ? 0 : 1;
}

The time taken by GPU is :0.026048

 Checking 100000 values in the array.......
We did it 100000  values correct!

Time for sequential: 0.316
All results are correct!!!, 
 Speedup = 12.1314



In [None]:
%%cu 
#include <stdio.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"

// Define matrix size
#define N 16

// Dense N x N matrix product on the device: c = a * b, all matrices stored
// row-major as flat arrays of N * N floats.
//
// Launch layout: 2-D grid of 2-D blocks; each thread computes one element of c,
// with x mapped to the column and y mapped to the row. Threads that fall
// outside the N x N output return without touching memory, so the grid may be
// rounded up past N in either dimension.
//
// Parameters:
//   a, b - input matrices
//   c    - output matrix
__global__ void matrix_multiply(float *a, float *b, float *c) {
    // Calculate thread index
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launches that cover more than N x N threads; without it the
    // surplus threads would read and write out of bounds.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += a[row * N + i] * b[i * N + col];
    }
    c[row * N + col] = sum;
}

// Host-side reference: computes c = a * b for row-major N x N float matrices,
// then prints c to stdout, one matrix row per line, followed by a dashed rule.
//
// Parameters:
//   a, b - input matrices, N * N floats each
//   c    - output matrix, N * N floats, overwritten
void matrixMultiplication(float *a ,float *b ,float *c) {
    // Phase 1: triple-loop product, accumulating along k in ascending order.
    for (int r = 0; r < N; ++r) {
        const int rowBase = r * N;
        for (int col = 0; col < N; ++col) {
            float acc = 0;
            for (int k = 0; k < N; ++k) {
                acc += a[rowBase + k] * b[k * N + col];
            }
            c[rowBase + col] = acc;
        }
    }

    // Phase 2: dump the result, breaking the line after every N-th value.
    printf("\nMatrix result using normal function : \n");
    for (int idx = 0; idx < N * N; ++idx) {
        printf("%f ", c[idx]);
        if ((idx + 1) % N == 0) {
            printf("\n");
        }
    }
    printf("\n-----------------------------------------------------------------------");
}



// Evaluates a CUDA runtime call and, on failure, prints the error string and
// makes the enclosing function return 1. Only usable inside a function that
// returns int (here: main). Buffers are not released on this path; the process
// is about to exit anyway.
#define MAT_CUDA_CHECK(call)                                                  \
    do {                                                                      \
        cudaError_t matErr_ = (call);                                         \
        if (matErr_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,          \
                    cudaGetErrorString(matErr_));                             \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Multiplies two random N x N matrices on the GPU and on the CPU, prints the
// inputs and both results, and reports both timings and the speedup.
//
// Returns 0 on success, 1 on an allocation failure or CUDA error.
int main() {
    // The launch below uses N / 16 blocks of 16 x 16 threads per dimension, which
    // only covers the whole matrix when N is a multiple of 16.
    static_assert(N % 16 == 0, "N must be a multiple of the 16x16 block size");

    float *a, *b, *c,*d;  // Pointers to matrices in host memory (c: GPU result, d: CPU result)
    float *dev_a = nullptr, *dev_b = nullptr, *dev_c = nullptr;  // Pointers to matrices in device memory
    const size_t size = (size_t)N * N * sizeof(float);

    // Allocate memory for matrices in host memory
    a = (float *)malloc(size);
    b = (float *)malloc(size);
    c = (float *)malloc(size);
    d = (float *)malloc(size);
    if (a == NULL || b == NULL || c == NULL || d == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        free(d);
        return 1;
    }

    // Initialize matrices with random values
    for (int i = 0; i < N * N; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    // Allocate memory for matrices in device memory
    MAT_CUDA_CHECK(cudaMalloc((void **)&dev_a, size));
    MAT_CUDA_CHECK(cudaMalloc((void **)&dev_b, size));
    MAT_CUDA_CHECK(cudaMalloc((void **)&dev_c, size));

    // Copy matrices from host memory to device memory
    MAT_CUDA_CHECK(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice));
    MAT_CUDA_CHECK(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice));

    // Define the grid and block dimensions
    dim3 dimGrid(N / 16, N / 16);
    dim3 dimBlock(16, 16);

    // Call the kernel function. The launch is asynchronous, so the clock is
    // stopped only after cudaDeviceSynchronize() has returned.
    clock_t tic, toc;
    tic = clock();
    matrix_multiply<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    MAT_CUDA_CHECK(cudaGetLastError());        // catches an invalid launch configuration
    MAT_CUDA_CHECK(cudaDeviceSynchronize());   // catches faults raised while the kernel ran
    toc = clock();

    float timeTakenGPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;

    // Copy the result matrix from device memory to host memory
    MAT_CUDA_CHECK(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost));

    // Print the A matrix
    printf("Matrix A : \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", a[i * N + j]);
        }
        printf("\n");
    }
    printf("\n---------------------------------------------------------------------------------\n");

    // Print the B matrix
    printf("Matrix B : \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", b[i * N + j]);
        }
        printf("\n");
    }
    printf("\n---------------------------------------------------------------------------------");

    // CPU reference. NOTE(review): matrixMultiplication() also prints its result,
    // so the interval measured here includes that printing, not just arithmetic.
    tic = clock();
    matrixMultiplication(a,b,d);
    toc = clock();

    float timeTakenCPU =(float) ((toc - tic)) / CLOCKS_PER_SEC;

    // Print the result matrix parallel
    printf("\nMatrix Result using cuda : \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", c[i * N + j]);
        }
        printf("\n");
    }
    printf("----------------------------------------------------------------------------------\n");

    // Free memory (d was previously leaked)
    free(a);
    free(b);
    free(c);
    free(d);
    MAT_CUDA_CHECK(cudaFree(dev_a));
    MAT_CUDA_CHECK(cudaFree(dev_b));
    MAT_CUDA_CHECK(cudaFree(dev_c));

    printf("\n ");
    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    if (timeTakenGPU > 0.0f) {
        printf("Speed Up: %f \n", timeTakenCPU/timeTakenGPU);
    } else {
        // clock() can report a zero interval; avoid dividing by it.
        printf("Speed Up: n/a (GPU time measured as 0) \n");
    }

    return 0;
}

Matrix A : 
83.000000 77.000000 93.000000 86.000000 49.000000 62.000000 90.000000 63.000000 40.000000 72.000000 11.000000 67.000000 82.000000 62.000000 67.000000 29.000000 
22.000000 69.000000 93.000000 11.000000 29.000000 21.000000 84.000000 98.000000 15.000000 13.000000 91.000000 56.000000 62.000000 96.000000 5.000000 84.000000 
36.000000 46.000000 13.000000 24.000000 82.000000 14.000000 34.000000 43.000000 87.000000 76.000000 88.000000 3.000000 54.000000 32.000000 76.000000 39.000000 
26.000000 94.000000 95.000000 34.000000 67.000000 97.000000 17.000000 52.000000 1.000000 86.000000 65.000000 44.000000 40.000000 31.000000 97.000000 81.000000 
9.000000 67.000000 97.000000 86.000000 6.000000 19.000000 28.000000 32.000000 3.000000 70.000000 8.000000 40.000000 96.000000 18.000000 46.000000 21.000000 
79.000000 64.000000 41.000000 93.000000 34.000000 24.000000 87.000000 43.000000 27.000000 59.000000 32.000000 37.000000 75.000000 74.000000 58.000000 29.000000 
35.000000 18.000000 43.000000

In [None]:
 !nvidia-smi

Sat May 13 09:46:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces