In [None]:
# Remove any CUDA toolkit and NVIDIA packages that are already installed.
# -y answers apt's confirmation prompt (a notebook cell has no interactive stdin);
# the patterns are quoted so the shell hands them to apt unexpanded.
!apt-get --purge remove -y cuda 'nvidia*' 'libnvidia-*'
# xargs -r skips the dpkg call entirely when grep finds no cuda- package.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get remove -y 'cuda-*'
!apt autoremove -y
!apt-get update

In [None]:
# Download the CUDA 9.2 local repository package, register it together with
# its signing key, refresh the package index and install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# -y: without it apt stops at its confirmation prompt, which a notebook cell cannot answer.
!apt-get install -y cuda-9.2

In [34]:
# Print the version of the installed CUDA compiler driver.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [35]:
# Install the nvcc4jupyter plugin straight from GitHub. The repository is fetched
# over https because GitHub no longer serves the unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-t4p8gmm_
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-t4p8gmm_


In [36]:
# Load the nvcc_plugin IPython extension (presumably the source of the %%cu
# cell magic used by later cells - confirm against the plugin's README).
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
%%writefile matMul.cu
#include<stdio.h>
#include<cuda.h>
/* Matrix dimensions. The kernel walks col1 terms per output element, so col1
   must equal row2 for the product to be defined. */
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */
/*
 * Computes one element of the matrix product n = l * m.
 *
 * l is indexed as a row-major row1 x col1 matrix, m as row-major with col2
 * columns, and n as row-major row1 x col2.
 *
 * Launch layout: a 2D grid with one block per output element
 * (blockIdx.x = output column, blockIdx.y = output row). Thread indices are
 * not used, so every thread of a block computes the same element; launch
 * with one thread per block.
 */
__global__ void matproduct(int *l,int *m, int *n)
{
	int x=blockIdx.x;
	int y=blockIdx.y;
	int k;
	int sum=0;

	/* Blocks that fall outside the output matrix (oversized grid) do nothing. */
	if(x>=col2 || y>=row1)
		return;

	/* Accumulate in a register and store once, instead of reading and
	   writing the global-memory result on every iteration. */
	for(k=0;k<col1;k++)
	{
		sum+=l[col1*y+k]*m[col2*k+x];
	}
	n[col2*y+x]=sum;
}

/* Reports a failed CUDA runtime call on stderr. Returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status,const char *what)
{
	if(status==cudaSuccess)
		return true;
	fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
	return false;
}

/*
 * Multiplies the host matrices a (row1 x col1) and b (row2 x col2) on the GPU
 * and prints the row1 x col2 product.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
 */
int main()
{
	int a[row1][col1]={{1,2,3},{1,2,3}};
	int b[row2][col2]={{1,2},{3,4},{5,6}};
	int c[row1][col2];
	/* Device buffers for a, b and the product. They start as NULL so the
	   cudaFree calls at the end are harmless on every exit path. */
	int *d=NULL,*e=NULL,*f=NULL;
	int i,j;
	bool ok;

	/* && short-circuits, so the first failing call ends the sequence. */
	ok=cudaOk(cudaMalloc((void **)&d,row1*col1*sizeof(int)),"cudaMalloc(d)")
		&& cudaOk(cudaMalloc((void **)&e,row2*col2*sizeof(int)),"cudaMalloc(e)")
		&& cudaOk(cudaMalloc((void **)&f,row1*col2*sizeof(int)),"cudaMalloc(f)")
		&& cudaOk(cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(a -> d)")
		&& cudaOk(cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(b -> e)");

	if(ok)
	{
		/* Here we are defining two dimensional Grid(collection of blocks) structure. Syntax is
		dim3 grid(no. of columns,no. of rows) */
		dim3 grid(col2,row1);

		matproduct<<<grid,1>>>(d,e,f);

		/* A kernel launch returns no status: configuration errors are read
		   with cudaGetLastError, and the blocking copy back to the host
		   reports any failure that happened while the kernel ran. */
		ok=cudaOk(cudaGetLastError(),"matproduct launch")
			&& cudaOk(cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost),"cudaMemcpy(f -> c)");
	}

	if(ok)
	{
		printf("\nProduct of two matrices:\n");
		for(i=0;i<row1;i++)
		{
			for(j=0;j<col2;j++)
			{
				printf("%d\t",c[i][j]);
			}
			printf("\n");
		}
	}

	cudaFree(d);
	cudaFree(e);
	cudaFree(f);
	return ok?0:1;
}

In [20]:
# Compile matMul.cu into the executable ./multiply.
!nvcc -o multiply matMul.cu

In [21]:
# Run the compiled matrix-multiplication program.
! ./multiply


Product of two matrices:
22	28	
22	28	


In [None]:
# Run ./multiply under the nvprof profiler.
!nvprof ./multiply

In [49]:
%%writefile optimize.cu
#include<stdio.h>
#include<cuda.h>
/* Matrix dimensions. The kernel combines col1 partial products per output
   element, so col1 must equal row2 for the product to be defined. */
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */

/*
 * Computes one element of the matrix product n = l * m, using shared memory
 * to hold the per-term products.
 *
 * l is indexed as a row-major row1 x col1 matrix, m as row-major with col2
 * columns, and n as row-major row1 x col2.
 *
 * Launch layout: a 2D grid with one block per output element
 * (blockIdx.x = output column, blockIdx.y = output row) and a 1D block whose
 * threads share the col1 multiplications. Any blockDim.x >= 1 is handled.
 * Static shared memory: col1 ints per block.
 */
__global__ void matproductsharedmemory(int *l,int *m, int *n)
{
    int x=blockIdx.x;
    int y=blockIdx.y;
    __shared__ int p[col1];

    int i;
    int k=threadIdx.x;

    /* Blocks outside the output matrix (oversized grid) do nothing. The test
       depends only on blockIdx, so a whole block leaves together and no
       thread is left waiting at the barrier below. */
    if(x>=col2 || y>=row1)
        return;

    /* Each thread fills the slots k, k+blockDim.x, ... so every entry of p is
       written exactly once and never out of bounds, whatever the block size. */
    for(i=k;i<col1;i+=blockDim.x)
        p[i]=l[col1*y+i]*m[col2*i+x];

    /* All partial products must be visible before they are summed. */
    __syncthreads();

    /* A single thread reduces and stores. Letting every thread zero and then
       read-modify-write the same n[col2*y+x] is a data race. */
    if(k==0)
    {
        int sum=0;
        for(i=0;i<col1;i++)
            sum+=p[i];
        n[col2*y+x]=sum;
    }
}

/* Reports a failed CUDA runtime call on stderr. Returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status,const char *what)
{
    if(status==cudaSuccess)
        return true;
    fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
    return false;
}

/*
 * Multiplies the host matrices a (row1 x col1) and b (row2 x col2) on the GPU
 * with the shared-memory kernel and prints the row1 x col2 product.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
 */
int main()
{
    int a[row1][col1]={{1,2,3},{4,5,6}};
    int b[row2][col2]={{1,2},{3,4},{5,6}};
    int c[row1][col2];
    /* Device buffers for a, b and the product. They start as NULL so the
       cudaFree calls at the end are harmless on every exit path. */
    int *d=NULL,*e=NULL,*f=NULL;
    int i,j;
    bool ok;

    /* && short-circuits, so the first failing call ends the sequence. */
    ok=cudaOk(cudaMalloc((void **)&d,row1*col1*sizeof(int)),"cudaMalloc(d)")
        && cudaOk(cudaMalloc((void **)&e,row2*col2*sizeof(int)),"cudaMalloc(e)")
        && cudaOk(cudaMalloc((void **)&f,row1*col2*sizeof(int)),"cudaMalloc(f)")
        && cudaOk(cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(a -> d)")
        && cudaOk(cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(b -> e)");

    if(ok)
    {
        /* One block per output element (columns on x, rows on y); the col1
           threads of a block each compute one partial product. */
        dim3 grid(col2,row1);

        matproductsharedmemory<<<grid,col1>>>(d,e,f);

        /* A kernel launch returns no status: configuration errors are read
           with cudaGetLastError, and the blocking copy back to the host
           reports any failure that happened while the kernel ran. */
        ok=cudaOk(cudaGetLastError(),"matproductsharedmemory launch")
            && cudaOk(cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost),"cudaMemcpy(f -> c)");
    }

    if(ok)
    {
        printf("\n Product of two matrices:\n ");
        for(i=0;i<row1;i++)
        {
            for(j=0;j<col2;j++)
            {
                printf("%d\t",c[i][j]);
            }
            printf("\n");
        }
    }

    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return ok?0:1;
}

Overwriting optimize.cu


In [50]:
# Compile optimize.cu into the executable ./opt.
!nvcc -o opt optimize.cu

In [51]:
# Run the compiled shared-memory matrix-multiplication program.
! ./opt


 Product of two matrices:
 22	28	
49	64	


In [52]:
# Profile ./opt with nvprof, printing one trace row per GPU operation
# (memory copies and kernel launches).
!nvprof --print-gpu-trace ./opt

==34563== NVPROF is profiling process 34563, command: ./opt

 Product of two matrices:
 22	28	
49	64	
==34563== Profiling application: ./opt
==34563== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
413.56ms  2.3360us                    -               -         -         -         -       24B  9.7980MB/s    Pageable      Device    Tesla K80 (0)         1         7  [CUDA memcpy HtoD]
413.58ms  1.5360us                    -               -         -         -         -       24B  14.901MB/s    Pageable      Device    Tesla K80 (0)         1         7  [CUDA memcpy HtoD]
413.80ms  4.5120us              (2 2 1)         (3 1 1)        14       12B        0B         -           -           -           -    Tesla K80 (0)         1         7  matproductsharedmemory(int*, int*, int*) [110]
413.81ms  2.3680us                    -               -      

In [57]:
%%cu
#include<stdio.h>
#include<cuda.h>
/*
 * Element-wise sum of two int arrays: z[i] = x[i] + y[i].
 *
 * Launch layout: the element index is taken from blockIdx.x alone, so the
 * grid needs one block per element. No length is passed in, so the grid size
 * must not exceed the arrays' length.
 */
__global__ void arradd(int *x,int *y, int *z)
{
    const int slot = blockIdx.x;
    const int total = x[slot] + y[slot];
    z[slot] = total;
}
/* Reports a failed CUDA runtime call on stderr. Returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status,const char *what)
{
    if(status==cudaSuccess)
        return true;
    fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
    return false;
}

/*
 * Fills two six-element host arrays, adds them element-wise on the GPU and
 * prints the inputs and the sum.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
 */
int main()
{
    int a[6];
    int b[6];
    int c[6];
    /* Device buffers for a, b and the sum. They start as NULL so the cudaFree
       calls at the end are harmless on every exit path. */
    int *d=NULL,*e=NULL,*f=NULL;
    int i;
    bool ok;

    printf("Program for Private Memory");
    printf("\n elements of first array: ");
    for(i=0;i<6;i++)
    {
        a[i]=i;
        printf("%d ",a[i]);
    }
    printf("\n elements of second array: ");
    for(i=0;i<6;i++)
    {
        b[i]=i;
        printf("%d ",b[i]);
    }

    /* && short-circuits, so the first failing call ends the sequence. */
    ok=cudaOk(cudaMalloc((void **)&d,6*sizeof(int)),"cudaMalloc(d)")
        && cudaOk(cudaMalloc((void **)&e,6*sizeof(int)),"cudaMalloc(e)")
        && cudaOk(cudaMalloc((void **)&f,6*sizeof(int)),"cudaMalloc(f)")
        && cudaOk(cudaMemcpy(d,a,6*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(a -> d)")
        && cudaOk(cudaMemcpy(e,b,6*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(b -> e)");

    if(ok)
    {
        /* One single-thread block per array element. */
        arradd<<<6,1>>>(d,e,f);

        /* A kernel launch returns no status: configuration errors are read
           with cudaGetLastError, and the blocking copy back to the host
           reports any failure that happened while the kernel ran. */
        ok=cudaOk(cudaGetLastError(),"arradd launch")
            && cudaOk(cudaMemcpy(c,f,6*sizeof(int),cudaMemcpyDeviceToHost),"cudaMemcpy(f -> c)");
    }

    if(ok)
    {
        printf("\nSum of two arrays:\n ");
        for(i=0;i<6;i++)
        {
            printf("%d\t",c[i]);
        }
    }

    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok?0:1;
}

Program for Private Memory
 elements of first array: 0 1 2 3 4 5 
 elements of second array: 0 1 2 3 4 5 
Sum of two arrays:
 0	2	4	6	8	10	


In [103]:
%%cu
#include<iostream>
#include<stdio.h>
#include<cuda.h>
__managed__ int x[10];
/*
 * Prints the ten entries of the managed array x, adding its own index to
 * each entry right after printing it, then prints a newline.
 *
 * Thread and block indices are not used: every launched thread walks the
 * whole array, so the kernel is meant to run as a single thread.
 */
__global__ void GPU_func( )
{
    int i = 0;
    while (i < 10)
    {
        printf("%d ", x[i]);
        x[i] += i;
        ++i;
    }
    printf("\n");
}
/*
 * Initializes the managed array x on the host, lets a single GPU thread print
 * and update it, then prints the updated values from the host.
 *
 * Returns 0 on success, 1 if the kernel launch or its execution fails.
 */
int main()
{
    for (int i = 0; i < 10; i++ )
        x[i] = i;

    GPU_func<<< 1, 1 >>>( );

    /* The launch itself returns no status: cudaGetLastError reports a bad
       launch, and cudaDeviceSynchronize reports failures during execution.
       The synchronize also has to finish before the host reads x again. */
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in GPU_func: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < 10; i++ )
    {
        printf("%d ", x[i]);
    }
    printf("\n");
    return 0;
}

0 1 2 3 4 5 6 7 8 9 
0 2 4 6 8 10 12 14 16 18 



In [105]:
%%cu
#include<stdio.h>
#include<cuda.h>
/* Matrix dimensions. The kernel combines col1 partial products per output
   element, so col1 must equal row2 for the product to be defined. */
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */

/*
 * Computes one element of the matrix product n = l * m, using shared memory
 * to hold the per-term products.
 *
 * l is indexed as a row-major row1 x col1 matrix, m as row-major with col2
 * columns, and n as row-major row1 x col2.
 *
 * Launch layout: a 2D grid with one block per output element
 * (blockIdx.x = output column, blockIdx.y = output row) and a 1D block whose
 * threads share the col1 multiplications. Any blockDim.x >= 1 is handled.
 * Static shared memory: col1 ints per block.
 */
__global__ void matproductsharedmemory(int *l,int *m, int *n)
{
    int x=blockIdx.x;
    int y=blockIdx.y;
    __shared__ int p[col1];

    int i;
    int k=threadIdx.x;

    /* Blocks outside the output matrix (oversized grid) do nothing. The test
       depends only on blockIdx, so a whole block leaves together and no
       thread is left waiting at the barrier below. */
    if(x>=col2 || y>=row1)
        return;

    /* Each thread fills the slots k, k+blockDim.x, ... so every entry of p is
       written exactly once and never out of bounds, whatever the block size. */
    for(i=k;i<col1;i+=blockDim.x)
        p[i]=l[col1*y+i]*m[col2*i+x];

    /* All partial products must be visible before they are summed. */
    __syncthreads();

    /* A single thread reduces and stores. Letting every thread zero and then
       read-modify-write the same n[col2*y+x] is a data race. */
    if(k==0)
    {
        int sum=0;
        for(i=0;i<col1;i++)
            sum+=p[i];
        n[col2*y+x]=sum;
    }
}

/* Reports a failed CUDA runtime call on stderr. Returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status,const char *what)
{
    if(status==cudaSuccess)
        return true;
    fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
    return false;
}

/*
 * Multiplies the host matrices a (row1 x col1) and b (row2 x col2) on the GPU
 * with the shared-memory kernel and prints the row1 x col2 product.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
 */
int main()
{
    printf("Program demonstrating shared memory\n");
    int a[row1][col1]={{1,2,3},{4,5,6}};
    int b[row2][col2]={{1,2},{3,4},{5,6}};
    int c[row1][col2];
    /* Device buffers for a, b and the product. They start as NULL so the
       cudaFree calls at the end are harmless on every exit path. */
    int *d=NULL,*e=NULL,*f=NULL;
    int i,j;
    bool ok;

    /* && short-circuits, so the first failing call ends the sequence. */
    ok=cudaOk(cudaMalloc((void **)&d,row1*col1*sizeof(int)),"cudaMalloc(d)")
        && cudaOk(cudaMalloc((void **)&e,row2*col2*sizeof(int)),"cudaMalloc(e)")
        && cudaOk(cudaMalloc((void **)&f,row1*col2*sizeof(int)),"cudaMalloc(f)")
        && cudaOk(cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(a -> d)")
        && cudaOk(cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice),"cudaMemcpy(b -> e)");

    if(ok)
    {
        /* One block per output element (columns on x, rows on y); the col1
           threads of a block each compute one partial product. */
        dim3 grid(col2,row1);

        matproductsharedmemory<<<grid,col1>>>(d,e,f);

        /* A kernel launch returns no status: configuration errors are read
           with cudaGetLastError, and the blocking copy back to the host
           reports any failure that happened while the kernel ran. */
        ok=cudaOk(cudaGetLastError(),"matproductsharedmemory launch")
            && cudaOk(cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost),"cudaMemcpy(f -> c)");
    }

    if(ok)
    {
        printf("\n Product of two matrices:\n ");
        for(i=0;i<row1;i++)
        {
            for(j=0;j<col2;j++)
            {
                printf("%d\t",c[i][j]);
            }
            printf("\n");
        }
    }

    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return ok?0:1;
}

Program demonstrating shared memory

 Product of two matrices:
 22	28	
49	64	

