In [None]:
# Refresh the apt package index before searching for / installing the CUDA packages.
!apt update

In [None]:
# List the packages whose name or description matches "cuda" to find the toolkit package name.
!apt search cuda

In [1]:
# Install the CUDA 10.2 toolkit meta-package.
# -y answers apt's confirmation prompt automatically so this cell also works
# under "Restart & Run All", where no interactive stdin is available.
!apt install -y cuda-10-2

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  cuda-command-line-tools-10-2 cuda-compiler-10-2 cuda-cudart-10-2
  cuda-cudart-dev-10-2 cuda-cufft-10-2 cuda-cufft-dev-10-2 cuda-cuobjdump-10-2
  cuda-cupti-10-2 cuda-cupti-dev-10-2 cuda-curand-10-2 cuda-curand-dev-10-2
  cuda-cusolver-10-2 cuda-cusolver-dev-10-2 cuda-cusparse-10-2
  cuda-cusparse-dev-10-2 cuda-demo-suite-10-2 cuda-documentation-10-2
  cuda-driver-dev-10-2 cuda-gdb-10-2 cuda-libraries-10-2
  cuda-libraries-dev-10-2 cuda-license-10-2 cuda-memcheck-10-2
  cuda-misc-headers-10-2 cuda-npp-10-2 cuda-npp-dev-10-2 cuda-nsight-10-2
  cuda-nsight-compute-10-2 cuda-nsight-systems-10-2 cuda-nvcc-10-2
  cuda-nvdisasm-10-2 cuda-nvgraph-10-2 cuda-nvgraph-dev-10-2 cuda-nvjpeg-10-2
  cuda-nvjpeg-dev-10-2 cuda-nvml-dev-10-2 cuda-nvprof-10-2 cuda-nvprune-10-2
  cuda-nvrtc-10-2 cuda-nvrtc-dev-10-2 cuda-nvtx-10-2 cuda-nvvp-10-2
  cuda-runti

In [2]:
!nvidia-smi

Fri Oct 22 23:14:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!man nvidia-smi

nvidia-smi(1)                       NVIDIA                       nvidia-smi(1)

NNAAMMEE
       nvidia-smi - NVIDIA System Management Interface program

SSYYNNOOPPSSIISS
       nvidia-smi [OPTION1 [ARG1]] [OPTION2 [ARG2]] ...

DDEESSCCRRIIPPTTIIOONN
       nvidia-smi (also NVSMI) provides monitoring and management capabilities
       for each of NVIDIA's Tesla, Quadro, GRID and GeForce devices from Fermi
       and higher architecture families. GeForce Titan series devices are sup‐
       ported for most functions with very limited  information  provided  for
       the  remainder  of  the  Geforce brand.  NVSMI is a cross platform tool
       that supports all standard NVIDIA driver-supported  Linux  distros,  as
       well as 64bit versions of Windows starting with Windows Server 2008 R2.
       Metrics can be consumed directly by users via stdout,  or  provided  by
       file via CSV and XML formats for scripting purposes.

       Note  that much of the func

In [5]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Oct_23_19:24:38_PDT_2019
Cuda compilation tools, release 10.2, V10.2.89


In [6]:
%%writefile add.cpp
#include <iostream>
#include <math.h>

// Element-wise sum of two float arrays, accumulated into the second one:
// after the call y[k] holds x[k] + y[k] for the first n elements.
// x is only read; nothing happens when n <= 0.
void add(int n, float *x, float *y)
{
  // Count down the remaining elements while both cursors advance together.
  for (int remaining = n; remaining > 0; --remaining, ++x, ++y)
    *y = *x + *y;
}

int main(void)
{
  // Problem size: 2^20 (about one million) elements per array
  int N = 1<<20;

  // Heap buffers for the two operands; y also receives the result
  float *x = new float[N];
  float *y = new float[N];

  // Fill the inputs on the host: every x is 1.0f, every y is 2.0f
  int k = 0;
  while (k < N) {
    x[k] = 1.0f;
    y[k] = 2.0f;
    ++k;
  }

  // Do the addition on the CPU over all N elements
  add(N, x, y);

  // Verify: each y should now be 3.0f, so track the largest deviation seen
  float maxError = 0.0f;
  for (k = 0; k < N; ++k) {
    maxError = fmax(maxError, fabs(y[k]-3.0f));
  }
  std::cout << "Max error: " << maxError << std::endl;

  // Release the buffers
  delete [] x;
  delete [] y;

  return 0;
}

Writing add.cpp


In [7]:
!ls -l

total 8
-rw-r--r-- 1 root root  730 Oct 22 23:18 add.cpp
drwxr-xr-x 1 root root 4096 Oct  8 13:45 sample_data


In [8]:
# Build the CPU-only baseline (add.cpp) into the executable ./add.
!nvcc -o add add.cpp

In [11]:
!time ./add

Max error: 0

real	0m0.030s
user	0m0.023s
sys	0m0.006s


In [14]:
%%writefile add.cu
#include <iostream>
#include <math.h>
// GPU kernel: adds x into y element by element (y[k] = x[k] + y[k]) for the
// first n elements.
// The loop does not use any thread or block index, so every thread that runs
// this kernel walks the whole array on its own; main launches it as <<<1, 1>>>.
__global__
void add(int n, float *x, float *y)
{
  int k = 0;
  while (k < n) {
    y[k] = x[k] + y[k];
    ++k;
  }
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

Overwriting add.cu


In [15]:
!ls -l

total 656
-rwxr-xr-x 1 root root 656648 Oct 22 23:18 add
-rw-r--r-- 1 root root    730 Oct 22 23:18 add.cpp
-rw-r--r-- 1 root root    929 Oct 22 23:24 add.cu
drwxr-xr-x 1 root root   4096 Oct  8 13:45 sample_data


In [16]:
# Build the single-thread CUDA version (add.cu) into the executable ./add_cuda.
!nvcc -o add_cuda add.cu

In [17]:
# Profile ./add_cuda with nvprof to get the kernel's GPU time and per-API-call timings.
!nvprof ./add_cuda

==935== NVPROF is profiling process 935, command: ./add_cuda
Max error: 0
==935== Profiling application: ./add_cuda
==935== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  424.68ms         1  424.68ms  424.68ms  424.68ms  add(int, float*, float*)
      API calls:   57.23%  424.71ms         1  424.71ms  424.71ms  424.71ms  cudaDeviceSynchronize
                   42.17%  312.97ms         2  156.48ms  774.44us  312.19ms  cudaMallocManaged
                    0.23%  1.6865ms         1  1.6865ms  1.6865ms  1.6865ms  cudaLaunchKernel
                    0.15%  1.0966ms         2  548.30us  510.19us  586.41us  cudaFree
                    0.11%  783.12us         1  783.12us  783.12us  783.12us  cuDeviceGetPCIBusId
                    0.08%  563.08us         1  563.08us  563.08us  563.08us  cuDeviceTotalMem
                    0.03%  244.75us        97  2.5230us     162ns  112.69us  cuDeviceGetAttribute
           

In [19]:
!time ./add_cuda

Max error: 0

real	0m0.618s
user	0m0.438s
sys	0m0.160s


In [20]:
%%writefile add_block.cu
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i] for
// the first n elements.
// Each thread starts at its global index within the whole grid and advances
// by the total number of launched threads, so every element is handled by
// exactly one thread for any <<<blocks, threads>>> configuration. With a
// single block (blockIdx.x == 0, gridDim.x == 1) this reduces to
// index = threadIdx.x and stride = blockDim.x.
__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
      y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 256>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

Writing add_block.cu


In [21]:
# Build the one-block, 256-thread version (add_block.cu) into the executable ./add_block.
!nvcc -o add_block add_block.cu

In [23]:
# Profile ./add_block with nvprof to get the kernel's GPU time and per-API-call timings.
!nvprof ./add_block

==1039== NVPROF is profiling process 1039, command: ./add_block
Max error: 0
==1039== Profiling application: ./add_block
==1039== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.3551ms         1  2.3551ms  2.3551ms  2.3551ms  add(int, float*, float*)
      API calls:   97.49%  208.68ms         2  104.34ms  682.40us  208.00ms  cudaMallocManaged
                    1.11%  2.3778ms         1  2.3778ms  2.3778ms  2.3778ms  cudaDeviceSynchronize
                    0.64%  1.3728ms         1  1.3728ms  1.3728ms  1.3728ms  cudaLaunchKernel
                    0.45%  955.55us         2  477.78us  420.52us  535.03us  cudaFree
                    0.22%  467.60us         1  467.60us  467.60us  467.60us  cuDeviceTotalMem
                    0.08%  170.60us        97  1.7580us     152ns  70.736us  cuDeviceGetAttribute
                    0.01%  25.251us         1  25.251us  25.251us  25.251us  cuDeviceGetName
         

In [24]:
!time ./add_block

Max error: 0

real	0m0.179s
user	0m0.025s
sys	0m0.144s


In [None]:
!time ./add_cuda

In [25]:
%%writefile add2.cu
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i] for
// the first n elements.
// Each thread starts at its global index within the whole grid and advances
// by the total number of launched threads, so every element is handled by
// exactly one thread for any <<<blocks, threads>>> configuration. With a
// single block (blockIdx.x == 0, gridDim.x == 1) this reduces to
// index = threadIdx.x and stride = blockDim.x.
__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
      y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

Writing add2.cu


In [26]:
# Build add2.cu (strided kernel launched as <<<1, 1>>>) into the executable ./add2.
!nvcc -o add2 add2.cu

In [27]:
# Profile ./add2 with nvprof to get the kernel's GPU time and per-API-call timings.
!nvprof ./add2

==1111== NVPROF is profiling process 1111, command: ./add2
Max error: 0
==1111== Profiling application: ./add2
==1111== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  477.82ms         1  477.82ms  477.82ms  477.82ms  add(int, float*, float*)
      API calls:   68.07%  477.85ms         1  477.85ms  477.85ms  477.85ms  cudaDeviceSynchronize
                   31.42%  220.60ms         2  110.30ms  759.12us  219.84ms  cudaMallocManaged
                    0.25%  1.7273ms         1  1.7273ms  1.7273ms  1.7273ms  cudaLaunchKernel
                    0.14%  990.62us         2  495.31us  484.85us  505.77us  cudaFree
                    0.08%  580.70us         1  580.70us  580.70us  580.70us  cuDeviceTotalMem
                    0.03%  238.90us        97  2.4620us     160ns  112.89us  cuDeviceGetAttribute
                    0.00%  28.816us         1  28.816us  28.816us  28.816us  cuDeviceGetName
                   

In [28]:
!time ./add2

Max error: 0

real	0m0.672s
user	0m0.516s
sys	0m0.146s
