In [10]:
!echo NVIDIA CUDA AND DRIVES VERIFICATION
%cd /usr/local/cuda/samples/1_Utilities/deviceQuery/
!ls
!make
!./deviceQuery
!nvcc --version
%cat /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery.cpp

NVIDIA CUDA AND DRIVES VERIFICATION
/usr/local/cuda-10.1/samples/1_Utilities/deviceQuery
deviceQuery	 deviceQuery.o	NsightEclipse.xml  src
deviceQuery.cpp  Makefile	readme.txt
make: Nothing to be done for 'all'.
./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "Tesla T4"
  CUDA Driver Version / Runtime Version          10.1 / 10.1
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 15080 MBytes (15812263936 bytes)
  (40) Multiprocessors, ( 64) CUDA Cores/MP:     2560 CUDA Cores
  GPU Max Clock rate:                            1590 MHz (1.59 GHz)
  Memory Clock rate:                             5001 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 4194304 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Tex

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ktok621s
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ktok621s
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=4cf5bf958ae4b7aa8e4001eeeeda8917200a9ef5e6024466e3d446c8b40e3044
  Stored in directory: /tmp/pip-ephem-wheel-cache-wfbj_1xf/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /usr/local/cuda-10.1/samples/1_Utilities/deviceQuery/src
Out bin /usr/local/cuda-10.1/samples/1_Utilities/deviceQuery/result.out


In [11]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [12]:
%%cu
/****** calculate pi *******/
#include <stdio.h>
#include <math.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <memory>
#include <iostream>
#include <sys/time.h>






// CUDA-C includes
#include <cuda.h>

int BLOCKSPERGRID  = 40;
int NUMTHREADS = 10240; // (40 Multiprocessors * 64 CUDA Cores/MP) * 4 
#define ITERATIONS 2e09

/*****************************************************************************
/*kernel
*****************************************************************************/


__global__ void calculatePi(double *piTotal, long int iterations, int totalThreads)
{   long int initIteration, endIteration;
    long int i = 0;
    double piPartial;
    
    int index = (blockDim.x * blockIdx.x) + threadIdx.x;

    initIteration = (iterations/totalThreads) * index;
    endIteration = initIteration + (iterations/totalThreads) - 1;
    
    i = initIteration;
    piPartial = 0;
    
    do{
        piPartial = piPartial + (double)(4.0 / ((i*2)+1));
        i++;
        piPartial = piPartial - (double)(4.0 / ((i*2)+1));
        i++;
    }while(i < endIteration);

    piTotal[index] = piPartial;
    
    __syncthreads();
    if(index == 0){
        for(i = 1; i < totalThreads; i++)
            piTotal[0] = piTotal[0] + piTotal[i];
    }
}


/******************************************************************************/


int main()
{   
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
 
    struct timeval tval_before, tval_after, tval_result;

    gettimeofday(&tval_before, NULL);
 

    int blocksPerGrid, threadsPerBlock, i, size;
    long int iterations;
    int totalThreads;
    double *h_pitotal, *d_pitotal;
    
    blocksPerGrid = BLOCKSPERGRID;
    cudaError_t err = cudaSuccess;

    size = sizeof(double)*NUMTHREADS;
    h_pitotal = (double *)malloc(size);
    if ( h_pitotal == NULL){
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    
    for(i = 0; i < NUMTHREADS; i++)
        h_pitotal[i] = 0.0;

    err = cudaMalloc((void **)&d_pitotal, size);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    
    err = cudaMemcpy(d_pitotal, h_pitotal, sizeof(double)*NUMTHREADS, cudaMemcpyHostToDevice);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Lanzar KERNEL
    threadsPerBlock = NUMTHREADS/blocksPerGrid;
    totalThreads = blocksPerGrid * threadsPerBlock;
    iterations = ITERATIONS;
    printf("CUDA kernel launch with %d blocks of %d threads Total: %i       ", blocksPerGrid, threadsPerBlock, totalThreads  );
    calculatePi<<<blocksPerGrid, threadsPerBlock>>>(d_pitotal, iterations, totalThreads);
    err = cudaGetLastError();
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(h_pitotal, d_pitotal, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_pitotal);
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Calculated pi: %.12f", *h_pitotal);
    // Free host memory

    free(h_pitotal);
    err = cudaDeviceReset();
    if (err != cudaSuccess){
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
 
    gettimeofday(&tval_after, NULL);

    timersub(&tval_after, &tval_before, &tval_result);

    printf("\nTime elapsed: %ld.%06ld\n", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);
    return 0;
}

CUDA kernel launch with 40 blocks of 256 threads Total: 10240       Calculated pi: 3.141592636327
Time elapsed: 0.411239

