# PYCUDA: run on collab only
pycuda seems to work better with python 2.X but in Collab it is possible to install it with python 3. 

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/dtrad/geoml_course/blob/master/pycudaexamples.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

Environment in Collab

In [1]:
!env 

NV_LIBCUBLAS_DEV_VERSION=11.3.0.106-1
NV_CUDA_COMPAT_PACKAGE=cuda-compat-11-1
__EGL_VENDOR_LIBRARY_DIRS=/usr/lib64-nvidia:/usr/share/glvnd/egl_vendor.d/
NV_CUDNN_PACKAGE_DEV=libcudnn8-dev=8.0.5.39-1+cuda11.1
PYDEVD_USE_FRAME_EVAL=NO
LD_LIBRARY_PATH=/usr/lib64-nvidia
NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.8.4-1+cuda11.1
CLOUDSDK_PYTHON=python3
LANG=en_US.UTF-8
NV_LIBNPP_DEV_PACKAGE=libnpp-dev-11-1=11.1.2.301-1
ENABLE_DIRECTORYPREFETCHER=1
HOSTNAME=a3eefa19291f
OLDPWD=/
CLOUDSDK_CONFIG=/content/.config
USE_AUTH_EPHEM=1
NV_LIBNPP_VERSION=11.1.2.301-1
NV_NVPROF_DEV_PACKAGE=cuda-nvprof-11-1=11.1.105-1
NVIDIA_VISIBLE_DEVICES=all
NV_NVPROF_VERSION=11.1.105-1
NV_LIBCUSPARSE_VERSION=11.3.0.10-1
DATALAB_SETTINGS_OVERRIDES={"kernelManagerProxyPort":6000,"kernelManagerProxyHost":"172.28.0.3","jupyterArgs":["--ip=172.28.0.2"],"debugAdapterMultiplexerPath":"/usr/local/bin/dap_multiplexer","enableLsp":true}
NV_LIBCUBLAS_DEV_PACKAGE=libcublas-dev-11-1=11.3.0.106-1
ENV=/root/.bashrc
PAGER=cat
NCCL_VERSIO

In [2]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 27.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2022.1.12.tar.gz (70 kB)
[K     |████████████████████████████████| 70 kB 9.5 MB/s 
[?25hCollecting mako
  Downloading Mako-1.2.2-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 8.5 MB/s 
Collecting platformdirs>=2.2.0
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2022.1-cp37-cp37m-linux_x86_64.whl size=629484 sha256=ea32490570dce2b1df6ee703aa83984e9d1df65d53fd

In [3]:
!echo $HOME


/root


In [4]:
!pip config list

global.disable-pip-version-check='True'
global.extra-index-url='https://us-python.pkg.dev/colab-wheels/public/simple/'
global.log='/var/log/pip.log'
list.format='columns'


In [5]:
import pycuda
import pycuda.driver as drv
drv.init()

In [6]:
print('CUDA device query (PyCUDA version) \n')
print('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))

CUDA device query (PyCUDA version) 

Detected 1 CUDA Capable device(s) 



In [7]:
for i in range(drv.Device.count()):
    gpu_device = drv.Device(i)
    print(gpu_device)
    print( 'Device {}: {}'.format( i, gpu_device.name() ) )
    compute_capability = float( '%d.%d' % gpu_device.compute_capability() )
    print( '\t Compute Capability: {}'.format(compute_capability))
    print( '\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))
    
    

<pycuda._driver.Device object at 0x7fb62bb99cb0>
Device 0: Tesla T4
	 Compute Capability: 7.5
	 Total Memory: 15109 megabytes


In [8]:
import pycuda.autoinit
from pycuda import gpuarray
from time import time
from pycuda.elementwise import ElementwiseKernel 
import numpy as np  
    

In [9]:
host_data = np.float32( np.random.random(50000000) )

gpu_2x_ker = ElementwiseKernel(
"float *in, float *out",
"out[i] = 2*in[i];",
"gpu_2x_ker")

In [10]:
def speedcomparison():
    t1 = time()
    host_data_2x =  host_data * np.float32(2)
    t2 = time()
    print('total time to compute on CPU: %f' % (t2 - t1))
    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    t2 = time()
    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) ))
    

In [11]:
# First time GPU is slow because it needs to compile the code.
speedcomparison()

total time to compute on CPU: 0.036922
total time to compute on GPU: 0.918199
Is the host computation the same as the GPU computation? : True


In [12]:
speedcomparison()

total time to compute on CPU: 0.037397
total time to compute on GPU: 0.000086
Is the host computation the same as the GPU computation? : True
