<a href="https://colab.research.google.com/github/ezippo/cmepda-EZ/blob/master/notebook_pycuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup iniziale

1. attivare supporto iniziale
2. installa pycuda



In [None]:
import pycuda

In [None]:
!pip install pycuda

3. controlla versione cuda


In [3]:
import pycuda

In [4]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


# BASH

In [5]:
!ls


sample_data


In [6]:
mkdir testdir


In [7]:
cd mkdir


[Errno 2] No such file or directory: 'mkdir'
/content


In [8]:
cd testdir/


/content/testdir


In [9]:
ls

In [10]:
touch ciao


SyntaxError: ignored

In [11]:

!gcc --version


gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



# Caratteristiche GPU


In [None]:
!nvidia-smi

In [13]:
import pycuda.driver as drv


In [16]:
# Initialise the CUDA driver API, count the visible GPUs and print, for each
# one, its name, compute capability and total memory expressed in MB.
drv.init()
drv.get_version()
devn = drv.Device.count()
print(f"N GPU ={devn}")
devices = [drv.Device(index) for index in range(devn)]
for gpu in devices:
    print(f"GPU name: {gpu.name()}")
    print(f"Compute Capability ={gpu.compute_capability()}")
    # total_memory() is divided by 2**20 to convert it to the MB figure printed.
    print(f"Total Memory = {gpu.total_memory()/(2.**20)}MB")

N GPU =1
GPU name: Tesla P4
Compute Capability =(6, 1)
Total Memory = 7611.9375MB


In [15]:
print(devn)

1


# Esempio in C

In [38]:
%%writefile VecAdd.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Element-wise vector sum C = A + B.
// Each thread computes one global index from its block and thread coordinates
// and writes at most one element; indices at or beyond NumElements are skipped.
__global__ void vectorAdd(const float *A,const float *B,float *C,int NumElements) {
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= NumElements) {
        return;
    }
    C[idx] = A[idx] + B[idx];
}

// HOST

// Evaluate a CUDA runtime call and, if it did not return cudaSuccess, print
// the failing location with the runtime's error string and leave main() with
// a non-zero exit status. Only usable inside a function returning int.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                   \
                    __FILE__, __LINE__, cudaGetErrorString(status_));      \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Host driver: fills two 15-element vectors (a[i] = i*i, b[i] = i), adds them
// on the GPU with vectorAdd and prints the result, one value per line.
// Returns 0 on success, 1 if any CUDA call fails.
int main(void) {
    // const so the host arrays below are ordinary fixed-size arrays rather
    // than variable-length arrays (a compiler extension in C++).
    const int NumElements = 15;
    const size_t size = NumElements*sizeof(float);
    float a[NumElements], b[NumElements], c[NumElements];
    float *a_gpu = NULL, *b_gpu = NULL, *c_gpu = NULL;

    CUDA_CHECK(cudaMalloc((void **)&a_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&b_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&c_gpu, size));

    for (int i = 0; i < NumElements; i++) {
        a[i] = i*i;
        b[i] = i;
    }

    printf("Copy input data from host to CUDA device \n");
    CUDA_CHECK(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice));

    // Round the block count up so every element is covered by some thread.
    int threadsPerBlock = 256;
    int blockPerGrid = (NumElements+threadsPerBlock-1)/threadsPerBlock;
    vectorAdd<<<blockPerGrid,threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, NumElements);
    // A kernel launch returns nothing; launch failures are reported here.
    CUDA_CHECK(cudaGetLastError());

    // Newline added: the first result used to be glued to this message.
    printf("Copy output data from CUDA device to host\n");
    CUDA_CHECK(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < NumElements; i++) {
        printf("%f \n", c[i]);
    }

    CUDA_CHECK(cudaFree(a_gpu));
    CUDA_CHECK(cudaFree(b_gpu));
    CUDA_CHECK(cudaFree(c_gpu));
    return 0;
}

Overwriting VecAdd.cu


In [39]:
!nvcc -o VecAdd VecAdd.cu

In [31]:
ls

[0m[01;32mVecAdd[0m*  VecAdd.cu


In [40]:
!./VecAdd

Copy input data from host to CUDA device 
Copy output data from CUDA device to host0.000000 
2.000000 
6.000000 
12.000000 
20.000000 
30.000000 
42.000000 
56.000000 
72.000000 
90.000000 
110.000000 
132.000000 
156.000000 
182.000000 
210.000000 


# Implementazione con PyCUDA

In [41]:
from pycuda import autoinit

In [42]:
from pycuda import gpuarray

In [43]:
import numpy as np

In [44]:
# Host-side inputs: a = 0..14, b = a squared, c = zero-filled output buffer,
# all stored as float32.
aux = range(15)
a = np.arange(len(aux), dtype=np.float32)
b = np.square(a)
c = np.zeros(len(aux), dtype=np.float32)
print(b)

[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81. 100. 121. 144. 169.
 196.]


In [45]:
# Copy the three host arrays to the device, one gpuarray per host array.
a_gpu, b_gpu, c_gpu = (gpuarray.to_gpu(host_array) for host_array in (a, b, c))


metodo semplice

In [46]:
c_gpu = a_gpu + b_gpu

In [47]:
c_gpu

array([  0.,   2.,   6.,  12.,  20.,  30.,  42.,  56.,  72.,  90., 110.,
       132., 156., 182., 210.], dtype=float32)

secondo metodo

In [48]:
from pycuda.elementwise import ElementwiseKernel

In [49]:
# Element-wise kernel computing c[i] = a[i] + b[i] over three float arrays.
myCudaFunc = ElementwiseKernel(
    arguments="float *a, float *b, float *c",
    operation="c[i] = a[i] + b[i]",
    name="mySumK",
)

In [50]:
myCudaFunc(a_gpu,b_gpu,c_gpu)

In [51]:
c_gpu

array([  0.,   2.,   6.,  12.,  20.,  30.,  42.,  56.,  72.,  90., 110.,
       132., 156., 182., 210.], dtype=float32)

terzo metodo

In [52]:
from pycuda.compiler import SourceModule

In [54]:
# Load the CUDA source text from VecAdd.cu. The context manager closes the
# file handle as soon as the text is read; the bare open() never closed it.
with open("VecAdd.cu", "r") as cudaCode:
    myCUDACode = cudaCode.read()

In [None]:
myCUDACode

In [56]:
# Build a PyCUDA module from the CUDA source text held in myCUDACode.
# NOTE(review): that text is the whole VecAdd.cu file, host main() included;
# confirm the compiler message this cell emits is expected.
myCode = SourceModule(myCUDACode)



  """Entry point for launching an IPython kernel.


In [57]:
importedKernel = myCode.get_function("vectorAdd")

In [58]:
# Launch-configuration values used when calling the imported kernel.
# NOTE(review): names presumably stand for threads-per-block and
# blocks-per-grid; the meaning of nGxB is unclear — confirm with the author.
nTxB, nBxG, nGxB = 256, 1, 1

In [61]:
c_gpu.set(c)

In [62]:
a_gpu.gpudata

<pycuda._driver.DeviceAllocation at 0x7f950e309d50>

In [64]:
# vectorAdd is declared with four parameters (A, B, C, NumElements). The element
# count must be passed explicitly as a 32-bit int to match the kernel's `int`;
# without it the bounds check `i < NumElements` has no valid value to compare
# against, even though 256 threads are launched for a much shorter array.
# NOTE(review): all three launch values are used as thread-block dimensions and
# no grid= is given, so one block is launched — confirm that is intended.
importedKernel(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, np.int32(a_gpu.size), block=(nTxB,nBxG,nGxB))

# SOMMA DI MATRICI