In [1]:
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2
!nvcc --version
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'nvidia-kernel-common-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-325-updates' for glob 'nvidia*'
Note, selecting 'nvidia-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-binary' for glob 'nvidia*'
Note, selecting 'nvidia-331-dev' for glob 'nvidia*'
Note, selecting 'nvidia-304-updates-dev' for glob 'nvidia*'
Note, selecting 'nvidia-compute-utils-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-384-dev' for glob 'nvidia*'
Note, selecting 'nvidia-libopencl1-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-340-updates-uvm' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-common' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-source-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-331-updates-uvm' for glob 'nvidi

In [8]:
%%cu
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <random>
#include <algorithm>
#include <math.h>
#include <cuda.h>
#include <stdlib.h> 
#include <time.h> 

//this is the kernel fucntion which executes on gpu and performs matrix multplication
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{ 
    int row = blockIdx.y * blockDim.y + threadIdx.y; 
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    //here i just keep a check that my col and row size is less then the defined size 
    //that is if i want a matrix of 20*50
    //if my col size or row size excedes then it will not execute because its error caused due to dimensions 
    //this helps to avoid any errors before matrix multplication
    if( col < k && row < m) 
    {
        for(int i = 0; i < n; i++) 
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
} 

//this fuction runs on cpu and it generates a matrix on cpu which is used to verify gpu output
void cpu_matrix_mult(int *cpu_a, int *cpu_b, int *cpu_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i) 
    {
        for (int j = 0; j < k; ++j) 
        {
            int tmp = 0.0;
            for (int h = 0; h < n; ++h) 
            {
                tmp += cpu_a[i * n + h] * cpu_b[h * k + j];
            }
            cpu_result[i * k + j] = tmp;
        }
    }
}

//this is the function which i call for performing matrix generation, then matrix muliplication and verification it does all this work
void matrix_function(int m,int n,int k,int epochs){
    //host is cpu and device is gpu
    // allocate memory in host(cpu) RAM, cpu_cc is used to store CPU result
    //here i generate the b matrix for multiplication
    int *cpu_b;
    cudaMallocHost((void **) &cpu_b, sizeof(int)*n*k);
    //this is the seed for rand function so random values are generated every time
    srand(time(0));   
    // random initialize matrix B
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            cpu_b[i * k + j] = rand() % 50;
        }
    }    
    //this loop runs for number of epochs for first case its 100 so it runs 100 times
    //based on epoch value it runs for that many iterations
    for(int o=0;o<epochs;o++){
        //this is the seed for rand function so random values are generated every time
        srand(time(0));  
        int *cpu_a,*cpu_c,*cpu_cc;
        cudaMallocHost((void **) &cpu_a, sizeof(int)*m*n);
        cudaMallocHost((void **) &cpu_c, sizeof(int)*m*k);
        cudaMallocHost((void **) &cpu_cc, sizeof(int)*m*k);

        // random initialize matrix A
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                cpu_a[i * n + j] = rand() % 50;
            }
          }
        //initialize gpu and cpu time as float 
        float gpu_elapsed_time_ms, cpu_elapsed_time_ms;

        // some events to count the execution time
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    
        // start to count execution time of GPU version
        cudaEventRecord(start, 0);

        // Allocate memory space on the device  
        int *gpu_a, *gpu_b, *gpu_c;
        cudaMalloc((void **) &gpu_a, sizeof(int)*m*n);
        cudaMalloc((void **) &gpu_b, sizeof(int)*n*k);
        cudaMalloc((void **) &gpu_c, sizeof(int)*m*k);
    
        // copy matrix A and B from host to device memory
        cudaMemcpy(gpu_a, cpu_a, sizeof(int)*m*n, cudaMemcpyHostToDevice);
        cudaMemcpy(gpu_b, cpu_b, sizeof(int)*n*k, cudaMemcpyHostToDevice);
    
        // Threads per CTA(compute thread array) dimension
        int THREADS = 32;

        // Blocks per grid dimension (assumes THREADS divides N evenly)
        int BLOCKS = n*THREADS / THREADS;

        // Use dim3 structs for block  and grid dimensions
        dim3 threads(THREADS, THREADS);
        dim3 blocks(BLOCKS, BLOCKS);

        // Launch kernel
        gpu_matrix_mult<<<blocks,threads>>>(gpu_a, gpu_b, gpu_c, m, n, k);    
        //cudaDeviceSynchronize();
          
        // Copy back to the host
        cudaMemcpy(cpu_c, gpu_c, sizeof(int)*m*k, cudaMemcpyDeviceToHost);
    
        // time counting terminate
        cudaEventRecord(stop, 0);

        // compute time elapse on GPU computing
        cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
        printf("Time elapsed for matrix multiplication on GPU: %f ms\n",gpu_elapsed_time_ms);
    
        // start the CPU version
        cudaEventRecord(start, 0);

        //perform matrix multiplication on cpu
        cpu_matrix_mult(cpu_a, cpu_b, cpu_cc, m, n, k);

        cudaEventRecord(stop, 0);
        cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);
        printf("Time elapsed for matrix multiplication on CPU: %f ms\n",cpu_elapsed_time_ms);
        // validate results computed by GPU with results generated on cpu
        int correct = 1;
        for (int i = 0; i < m; ++i){
            for (int j = 0; j < k; ++j)
            {
                //printf("[%d][%d]:%d == [%d][%d]:%d, ", i, j, cpu_cc[i*k + j], i, j, cpu_c[i*k + j]);
                //cpu_cc are results generated by matrix multiplication on cpu
                //cpu_c are results generated by matrix muliplication gpu and copied back on cpu for comparison
                if(cpu_cc[i*k + j] != cpu_c[i*k + j])
                {
                    correct = 0;
                }
            }
            //printf("\n");
        }

        // roughly compute speedup
        if(correct)
        {   
            //for each of the matrix multiplication it returns time elapsed on cpu and gpu and results of matrix mulplication
            printf("all results are correct!!! for c%d=[%dx%d]*[%dx%d], speedup = %f\n\n",o+1,m, n, n, k,cpu_elapsed_time_ms / gpu_elapsed_time_ms);
        }
        else
        {
            printf("incorrect results\n");
         }

        // free memory
        cudaFree(gpu_a);
        cudaFree(gpu_b);
        cudaFree(gpu_c);
        cudaFreeHost(cpu_a);
        cudaFreeHost(cpu_c);
        cudaFreeHost(cpu_cc);
    }
    cudaFreeHost(cpu_b);
}

//this is the function which call above function with different cases
int main(){
    //wait for 5-10  minutes to see the results as matrix is calculated on cpu too for verification
    
    //calling for case1
    int m,n,k,epochs;
    m=500,n=500,k=400,epochs=100;
    matrix_function(m,n,k,epochs);

    //calling for case2
    m=50,n=20,k=50,epochs=5000;
    matrix_function(m,n,k,epochs);
    
    //calling for case 3
    m=6,n=4000,k=9,epochs=1000;
    matrix_function(m,n,k,epochs);
    
    printf("All the matrix ouput are correct that is cpu output matches gpu output\n\n");
    printf("Sucess!!! done");
    //also google colabd by default truncates output to 5k lines so matrix_function can be called individually
    //for each of cases that is commenting other two case matrix function by // to see individual ouput of each case
    //if not commentd then it will display output of all three cases and when all outputs are correct it will print Sucess!!!done 
    return 0;
}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Time elapsed for matrix multiplication on GPU: 0.125664 ms
Time elapsed for matrix multiplication on CPU: 0.135872 ms
all results are correct!!! for c4752=[50x20]*[20x50], speedup = 1.081233

Time elapsed for matrix multiplication on GPU: 0.125664 ms
Time elapsed for matrix multiplication on CPU: 0.135872 ms
all results are correct!!! for c4753=[50x20]*[20x50], speedup = 1.081233

Time elapsed for matrix multiplication on GPU: 0.125664 ms
Time elapsed for matrix multiplication on CPU: 0.135872 ms
all results are correct!!! for c4754=[50x20]*[20x50], speedup = 1.081233

Time elapsed for matrix multiplication on GPU: 0.125664 ms
Time elapsed for matrix multiplication on CPU: 0.135872 ms
all results are correct!!! for c4755=[50x20]*[20x50], speedup = 1.081233

Time elapsed for matrix multiplication on GPU: 0.125664 ms
Time elapsed for matrix multiplication on CPU: 0.135872 ms
all results are correct!!! for c4756=[50x20]*[20x