In [104]:
# Check whether the CUDA toolkit is installed
!ls /usr/local

bin	   cuda-10.1  cuda-11.1  _gcs_config_ops.so  licensing	share
cuda	   cuda-11    etc	 include	     man	src
cuda-10.0  cuda-11.0  games	 lib		     sbin	xgboost


In [105]:
# Check whether the nvcc command is available
!which nvcc

/usr/local/cuda/bin/nvcc


In [106]:
# Show the properties of the NVIDIA card (mine is a Tesla K80)
!nvidia-smi

Tue Apr  5 13:57:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [107]:
# Write a .cu file containing the host and kernel code
%%writefile coordinate.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/**
 * Debug kernel: every thread prints its thread index, block index, 2D
 * coordinate, flattened row-major index and the value of A at that index.
 *
 * Launch layout: a 2D grid of 2D blocks. The grid may be larger than the
 * nx-by-ny matrix (ceil-div launch); threads that fall outside it exit
 * without touching A.
 *
 * A  - device pointer to at least nx*ny ints, stored row-major with row
 *      length nx.
 * nx - number of columns (extent along x).
 * ny - number of rows (extent along y).
 *
 * Device-side printf is meant for debugging only; the output becomes visible
 * on the host once the host synchronizes with the device.
 */
__global__ void printCoordinate(int *A,const int nx,const int ny)
{
  const int ix=threadIdx.x+blockIdx.x*blockDim.x;
  const int iy=threadIdx.y+blockIdx.y*blockDim.y;

  // Guard the grid tail: when nx or ny is not a multiple of the block size,
  // the surplus threads would otherwise read A out of bounds.
  if(ix>=nx || iy>=ny)
  {
    return;
  }

  // Row-major flattening of the 2D coordinate.
  const unsigned int idx=iy*nx+ix;
  printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)"
          "global index %2d ival %2d\n",threadIdx.x,threadIdx.y,
          blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}

// Abort with a readable message when a CUDA runtime call fails.
// status - return code of the call being checked
// what   - short description of the call, used in the error message
static void checkCuda(cudaError_t status,const char *what)
{
  if(status!=cudaSuccess)
  {
    fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host entry point: builds an nx-by-ny zero-filled matrix, copies it to the
// device and launches printCoordinate so that every thread reports its
// coordinates. Returns 0 on success; exits with EXIT_FAILURE on any
// allocation or CUDA runtime error.
int main(int argc,char ** argv)
{
  checkCuda(cudaSetDevice(0),"cudaSetDevice");
  // __host__ cudaError_t cudaSetDevice ( int  device )
  // Set device to be used for GPU executions.
  // device - Device on which the active host thread should execute the device code.

  const int nx = 8;
  const int ny = 6;
  // Size of the matrix in bytes, computed once and reused for every
  // allocation and copy so the sizes cannot drift apart.
  const size_t nBytes = (size_t)nx*ny*sizeof(int);

  // Malloc
  int* A_host = (int*)malloc(nBytes);
  if(A_host == NULL){
      fprintf(stderr,"Failed to allocate %zu bytes of host memory\n",nBytes);
      return EXIT_FAILURE;
  }

  for(int i = 0; i<nx*ny; ++i){
      A_host[i] = 0;
  }

  // __host__ __device__ cudaError_t cudaMalloc ( void** devPtr, size_t size )
  // devPtr - Pointer to allocated device memory
  // size - Requested allocation size in bytes
  // __host__ is a function type qualifier that indicates that a function
  // should be compiled for the host (CPU)
  // __device__ is a function type qualifier that indicates
  // that a function should be compiled for the device (GPU).
  // cudaError_t is an enumerated type that represents the error codes
  // returned by CUDA Runtime API functions.

  int *A_dev=NULL;
  checkCuda(cudaMalloc((void**)&A_dev, nBytes),"cudaMalloc");
  checkCuda(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice),
            "cudaMemcpy (host to device)");
  // __host__ cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
  // dst - Destination memory address
  // src - Source memory address
  // count - Size in bytes to copy
  // kind - Type of transfer

  dim3 block(4,2);
  // a two-dimensional block of threads with dimensions 4 and 2 in the x and y directions
  dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
  // the dimensions of the grid of thread blocks: ceil(nx/block.x) by
  // ceil(ny/block.y), so the grid covers the whole matrix.

  printCoordinate<<<grid,block>>>(A_dev,nx,ny);
  // A kernel launch returns nothing: an invalid launch configuration is
  // reported by cudaGetLastError(), and faults raised while the kernel runs
  // are reported by the next synchronizing call.
  checkCuda(cudaGetLastError(),"printCoordinate launch");
  // Waiting for the kernel also makes its printf output appear on the host.
  checkCuda(cudaDeviceSynchronize(),"printCoordinate execution");

  // Free
  checkCuda(cudaFree(A_dev),"cudaFree");
  free(A_host);

  checkCuda(cudaDeviceReset(),"cudaDeviceReset");
  return 0;
}

Overwriting coordinate.cu


In [108]:
# Compile the code. The architecture flags (sm_37 / compute_37) are needed when targeting the Tesla K80.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 coordinate.cu -o coordinate



In [109]:
!./coordinate

thread_id(0,0) block_id(0,2) coordinate(0,4)global index 32 ival  0
thread_id(1,0) block_id(0,2) coordinate(1,4)global index 33 ival  0
thread_id(2,0) block_id(0,2) coordinate(2,4)global index 34 ival  0
thread_id(3,0) block_id(0,2) coordinate(3,4)global index 35 ival  0
thread_id(0,1) block_id(0,2) coordinate(0,5)global index 40 ival  0
thread_id(1,1) block_id(0,2) coordinate(1,5)global index 41 ival  0
thread_id(2,1) block_id(0,2) coordinate(2,5)global index 42 ival  0
thread_id(3,1) block_id(0,2) coordinate(3,5)global index 43 ival  0
thread_id(0,0) block_id(1,1) coordinate(4,2)global index 20 ival  0
thread_id(1,0) block_id(1,1) coordinate(5,2)global index 21 ival  0
thread_id(2,0) block_id(1,1) coordinate(6,2)global index 22 ival  0
thread_id(3,0) block_id(1,1) coordinate(7,2)global index 23 ival  0
thread_id(0,1) block_id(1,1) coordinate(4,3)global index 28 ival  0
thread_id(1,1) block_id(1,1) coordinate(5,3)global index 29 ival  0
thread_id(2,1) block_id(1,1) coordinate(6,3)glob