# PYCUDA: run on collab only
pycuda seems to work better with python 2.X but in Collab it is possible to install it with python 3. 

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/dtrad/geoml_course/blob/master/pycudaexamples.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

Environment in Collab

In [1]:
!env 

CUDNN_VERSION=8.0.4.30
__EGL_VENDOR_LIBRARY_DIRS=/usr/lib64-nvidia:/usr/share/glvnd/egl_vendor.d/
PYDEVD_USE_FRAME_EVAL=NO
LD_LIBRARY_PATH=/usr/lib64-nvidia
CLOUDSDK_PYTHON=python3
LANG=en_US.UTF-8
HOSTNAME=cea6e5a562bc
OLDPWD=/
CLOUDSDK_CONFIG=/content/.config
NVIDIA_VISIBLE_DEVICES=all
DATALAB_SETTINGS_OVERRIDES={"kernelManagerProxyPort":6000,"kernelManagerProxyHost":"172.28.0.3","jupyterArgs":["--ip=\"172.28.0.2\""],"debugAdapterMultiplexerPath":"/usr/local/bin/dap_multiplexer","enableLsp":true}
ENV=/root/.bashrc
PAGER=cat
NCCL_VERSION=2.7.8
TF_FORCE_GPU_ALLOW_GROWTH=true
JPY_PARENT_PID=52
NO_GCE_CHECK=True
PWD=/content
HOME=/root
LAST_FORCED_REBUILD=20210714
CLICOLOR=1
DEBIAN_FRONTEND=noninteractive
LIBRARY_PATH=/usr/local/cuda/lib64/stubs
GCE_METADATA_TIMEOUT=0
GLIBCPP_FORCE_NEW=1
TBE_CREDS_ADDR=172.28.0.1:8008
TERM=xterm-color
SHELL=/bin/bash
GCS_READ_CACHE_BLOCK_SIZE_MB=16
MPLBACKEND=module://ipykernel.pylab.backend_inline
CUDA_VERSION=11.0.3
NVIDIA_DRIVER_CAPABILITIES=compute,u

In [2]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 7.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2021.2.7.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 2.6 MB/s 
[?25hCollecting mako
  Downloading Mako-1.1.4-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 5.3 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=627882 sha256=90a41d3bcad6ea244f692bc3c884cc032d1b3cfd3dd596695638037f2b170bc7
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdo

In [3]:
!echo $HOME


/root


In [4]:
!pip config list

global.disable-pip-version-check='True'
global.log='/var/log/pip.log'
list.format='columns'


In [5]:
import pycuda
import pycuda.driver as drv
drv.init()

In [6]:
print('CUDA device query (PyCUDA version) \n')
print('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))

CUDA device query (PyCUDA version) 

Detected 1 CUDA Capable device(s) 



In [7]:
for i in range(drv.Device.count()):
    gpu_device = drv.Device(i)
    print(gpu_device)
    print( 'Device {}: {}'.format( i, gpu_device.name() ) )
    compute_capability = float( '%d.%d' % gpu_device.compute_capability() )
    print( '\t Compute Capability: {}'.format(compute_capability))
    print( '\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))
    
    

<pycuda._driver.Device object at 0x7f6629ccf370>
Device 0: Tesla T4
	 Compute Capability: 7.5
	 Total Memory: 15109 megabytes


In [8]:
import pycuda.autoinit
from pycuda import gpuarray
from time import time
from pycuda.elementwise import ElementwiseKernel 
import numpy as np  
    

In [9]:
host_data = np.float32( np.random.random(50000000) )

gpu_2x_ker = ElementwiseKernel(
"float *in, float *out",
"out[i] = 2*in[i];",
"gpu_2x_ker")

In [10]:
def speedcomparison():
    t1 = time()
    host_data_2x =  host_data * np.float32(2)
    t2 = time()
    print('total time to compute on CPU: %f' % (t2 - t1))
    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    t2 = time()
    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) ))
    

In [11]:
# First time GPU is slow because it needs to compile the code.
speedcomparison()

total time to compute on CPU: 0.040803
total time to compute on GPU: 1.106045
Is the host computation the same as the GPU computation? : True


In [12]:
speedcomparison()

total time to compute on CPU: 0.038579
total time to compute on GPU: 0.000106
Is the host computation the same as the GPU computation? : True
