In [1]:
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'nvidia-kernel-common-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-325-updates' for glob 'nvidia*'
Note, selecting 'nvidia-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-binary' for glob 'nvidia*'
Note, selecting 'nvidia-331-dev' for glob 'nvidia*'
Note, selecting 'nvidia-304-updates-dev' for glob 'nvidia*'
Note, selecting 'nvidia-compute-utils-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-384-dev' for glob 'nvidia*'
Note, selecting 'nvidia-libopencl1-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-340-updates-uvm' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-common' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-source-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-331-updates-uvm' for glob 'nvidi

In [1]:
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

--2021-04-18 16:38:17--  https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Resolving developer.nvidia.com (developer.nvidia.com)... 152.199.0.24
Connecting to developer.nvidia.com (developer.nvidia.com)|152.199.0.24|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 [following]
--2021-04-18 16:38:17--  https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Reusing existing connection to developer.nvidia.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://developer.download.nvidia.com/compute/cuda/9.2/secure/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb?hcIWXVbneYpvZB--dW6BszIKGnW3Luj7m251V8aGuXFzf7abXybtx50l4CFZNh83C-vxCNBJNHir0Z82DAgFWiQ-aOra5jioKc5b_uQx1MEyhhYB6

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [3]:
# Install the nvcc Jupyter plugin over HTTPS; GitHub no longer serves the
# unauthenticated git:// protocol, so the git+git:// form fails to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-8fie4gtm
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-8fie4gtm
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=36448ef2aeaf77820ea8374723b2b2494ca96638655bae0f7108ed58d91d61cd
  Stored in directory: /tmp/pip-ephem-wheel-cache-humkk5e9/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [5]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>

// Launch geometry: BLOCK_NUM blocks of THREAD_NUM threads each.
// THREAD_NUM also sizes the per-block shared-memory array in the kernel.
const int BLOCK_NUM = 64;
const int THREAD_NUM = 512;

// Number of integration steps over [0, 1] and the width of a single step.
const long STEP_NUM = 1070596096;
const float STEP_LENGTH = 1.0 / STEP_NUM;

// Reference value the computed result is compared against.
const float PI = 3.1415926535897932;

/*
 * Midpoint-rule integration of 4 / (1 + x^2), producing one partial sum per block.
 *
 * Each thread sums a contiguous range of steps, then the block combines its
 * threads' sums with a shared-memory tree reduction.
 *
 * globalSum  - device array with one slot per block; slot blockIdx.x receives
 *              that block's partial sum.
 * stepNum    - total number of integration steps.
 * stepLength - width of one step.
 * threadNum  - threads per block the work is divided for.
 * blockNum   - number of blocks the work is divided for.
 *
 * Preconditions: blockDim.x must not exceed THREAD_NUM (the size of the shared
 * array), and blockDim.x must be a power of two for the halving reduction to
 * visit every slot. The work split assumes the launch uses blockNum blocks of
 * threadNum threads.
 */
__global__ void integrate(float *globalSum, int stepNum, float stepLength, int threadNum, int blockNum)
{
  int localThreadId = threadIdx.x;
  int blockId = blockIdx.x;
  int globalThreadId = threadIdx.x + blockIdx.x * blockDim.x;

  int totalThreads = blockNum * threadNum;
  int stepsPerThread = stepNum / totalThreads;
  int start = stepsPerThread * globalThreadId;
  int end = stepsPerThread * (globalThreadId + 1);

  // The integer division above drops stepNum % totalThreads steps; give them
  // to the last thread so the whole interval is covered.
  if (globalThreadId == totalThreads - 1)
    end = stepNum;

  // shared memory to hold the sum for each block
  __shared__ float blockSum[THREAD_NUM];

  // Accumulate in a register. Each thread then writes only its own slot, so
  // no thread can clear or overwrite another thread's partial sum.
  float threadSum = 0.0f;
  for (int i = start; i < end; i++)
  {
    float x = (i + 0.5f) * stepLength;
    threadSum += 1.0f / (1.0f + x * x);
  }
  blockSum[localThreadId] = threadSum * (stepLength * 4);

  // every slot must be written before any thread reads a neighbour's slot
  __syncthreads();

  // for each block, do sum using shared memory
  for (int i = blockDim.x / 2; i > 0; i >>= 1)
  {
    if (localThreadId < i)
      blockSum[localThreadId] += blockSum[localThreadId + i];

    // barrier is outside the branch so all threads of the block reach it
    __syncthreads();
  }

  // thread 0 publishes the block's total
  if (localThreadId == 0)
    globalSum[blockId] = blockSum[0];
}

/*
 * Sums the first arraySize elements of sumArray and stores the total in *sum.
 *
 * Must be launched with a single block: the reduction only uses threadIdx.x
 * and block-level barriers. sumArray is used as scratch space and is
 * overwritten. If arraySize is not positive, *sum is set to 0.
 *
 * sum       - device pointer that receives the total.
 * sumArray  - device array of at least arraySize floats (modified in place).
 * arraySize - number of valid elements in sumArray.
 */
__global__ void sumReduce(float *sum, float *sumArray, int arraySize)
{
  int localThreadId = threadIdx.x;
  int threadCount = blockDim.x;

  // Fold elements beyond the block width into the first threadCount slots.
  // Each thread only touches indices congruent to its own id, so there are
  // no conflicts between threads here.
  for (int j = localThreadId + threadCount; j < arraySize; j += threadCount)
    sumArray[localThreadId] += sumArray[j];

  __syncthreads();

  // Number of slots that now hold data.
  int active = (arraySize < threadCount) ? arraySize : threadCount;

  // Start from the smallest power of two >= active so odd counts are not dropped.
  int stride = 1;
  while (stride < active)
    stride <<= 1;

  for (stride >>= 1; stride > 0; stride >>= 1)
  {
    // the second test keeps reads inside the valid part of the array
    if (localThreadId < stride && localThreadId + stride < active)
      sumArray[localThreadId] += sumArray[localThreadId + stride];

    __syncthreads();
  }

  if (localThreadId == 0)
    *sum = (arraySize > 0) ? sumArray[0] : 0.0f;
}

// Reports a failed CUDA runtime call; returns true when status is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;

  printf("%s returned %d\n-> %s\n", what, (int)status, cudaGetErrorString(status));
  return false;
}

/*
 * Runs the integrate and sumReduce kernels, times them with CUDA events,
 * prints the result and its error against PI, and asserts the error is
 * within 0.001.
 *
 * Returns 0 on success and 1 if no device is available or any CUDA call fails.
 */
int main()
{
  int deviceCount = 0;

  printf("\nConfiguring device...\n");

  cudaError_t error = cudaGetDeviceCount(&deviceCount);

  if (error != cudaSuccess)
  {
    printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error, cudaGetErrorString(error));
    return 1;
  }

  if(deviceCount == 0)
  {
    printf("There are no available CUDA device(s)\n");
    return 1;
  }
  else
    printf("%d CUDA Capable device(s) detected\n", deviceCount);

  float pi = 0.0;
  float optimizedGpuTime = 0;
  float *deviceBlockSum = NULL;
  float *devicePi = NULL;
  cudaEvent_t startTime = NULL;
  cudaEvent_t stopTime = NULL;

  // allocate memory on GPU, create the timing events and start the timer
  bool ok = cudaOk(cudaMalloc((void **) &devicePi, sizeof(float)), "cudaMalloc(devicePi)")
         && cudaOk(cudaMalloc((void **) &deviceBlockSum, sizeof(float) * BLOCK_NUM), "cudaMalloc(deviceBlockSum)")
         && cudaOk(cudaEventCreate(&startTime), "cudaEventCreate(startTime)")
         && cudaOk(cudaEventCreate(&stopTime), "cudaEventCreate(stopTime)")
         && cudaOk(cudaEventRecord(startTime, 0), "cudaEventRecord(startTime)");

  if (ok)
  {
    printf("Start calculating in optimized kernel function...\n");
    integrate<<<BLOCK_NUM, THREAD_NUM>>>(deviceBlockSum, STEP_NUM, STEP_LENGTH, THREAD_NUM, BLOCK_NUM);
    // a kernel launch returns nothing; launch errors are read back explicitly
    ok = cudaOk(cudaGetLastError(), "integrate launch");
  }

  if (ok)
  {
    sumReduce<<<1, BLOCK_NUM>>>(devicePi, deviceBlockSum, BLOCK_NUM);
    ok = cudaOk(cudaGetLastError(), "sumReduce launch");
  }

  // retrieve result from device, then stop the timer
  ok = ok
    && cudaOk(cudaMemcpy(&pi, devicePi, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(pi)")
    && cudaOk(cudaEventRecord(stopTime, 0), "cudaEventRecord(stopTime)")
    && cudaOk(cudaEventSynchronize(stopTime), "cudaEventSynchronize(stopTime)")
    && cudaOk(cudaEventElapsedTime(&optimizedGpuTime, startTime, stopTime), "cudaEventElapsedTime");

  if (ok)
  {
    printf("PI = %.16lf with error %.16lf\nTime elapsed : %f seconds.\n\n", pi, fabs(pi - PI), optimizedGpuTime / 1000);
    assert(fabs(pi - PI) <= 0.001);
  }

  // release events and device memory on every path
  if (startTime != NULL)
    cudaEventDestroy(startTime);
  if (stopTime != NULL)
    cudaEventDestroy(stopTime);
  cudaFree(deviceBlockSum);
  cudaFree(devicePi);

  // reset Device
  cudaDeviceReset();
  return ok ? 0 : 1;
}



Configuring device...
1 CUDA Capable device(s) detected
Start calculating in optimized kernel function...
PI = 3.1416046619415283 with error 0.0000119209289551
Time elapsed : 0.009735 seconds.


