<a href="https://colab.research.google.com/github/chiyanglin-AStar/2025_physics_note/blob/main/02_Pycuda_ex6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is adapted from [Linking Python to CUDA with PyCUDA: A Beginner’s Guide](https://medium.com/@mahmoudalyosify/linking-python-to-cuda-with-pycuda-a-beginners-guide-d128da0ed460)

## PyCUDA references

[PyCUDA Tutorial(翻譯)](https://hackmd.io/@shaoeChen/SkbmZOXbB/https%3A%2F%2Fhackmd.io%2F%40shaoeChen%2FSkKb0fX-H)

[pycuda tutorial](https://documen.tician.de/pycuda/tutorial.html)

[PyCUDA Tutorial Introduction](https://github.com/berlinguyinca/pycuda/blob/master/doc/source/tutorial.rst)

[GPU程式設計(5) -- Python](https://ithelp.ithome.com.tw/articles/10283144)

In [4]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2024.1.2.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m66.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.21-py3-none-any.whl.metadata (2.9 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading pytools-2024.1.21-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.4/92.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Mako-1.3.8

## GPU Demo Example:
from [pycuda demo](https://github.com/inducer/pycuda/blob/main/examples/demo.py)

##### Sample source code from the Tutorial Introduction in the documentation.

In [5]:
import pycuda.driver as cuda
import pycuda.autoinit  # noqa
from pycuda.compiler import SourceModule

In [6]:
import numpy

In [7]:
# Host input: a 4x4 array of random values, converted to float32 because the
# kernel below takes a `float *`.
a = numpy.random.randn(4, 4)

a = a.astype(numpy.float32)

# Allocate a device buffer with exactly as many bytes as the host array holds,
# then copy the host data into it.
a_gpu = cuda.mem_alloc(a.nbytes)

cuda.memcpy_htod(a_gpu, a)

# CUDA kernel: each thread turns its (x, y) position inside the block into a
# linear offset (row width hard-coded to 4) and doubles that one element.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
      int idx = threadIdx.x + threadIdx.y*4;
      a[idx] *= 2;
    }
    """)

func = mod.get_function("doublify")
# Launch a single block of 4x4 threads, i.e. one thread per array element.
func(a_gpu, block=(4, 4, 1), grid=(1, 1), shared=0)

# Copy the device buffer back into a fresh host array of the same shape/dtype.
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)

# Sanity check: `a` was not modified on the host, so the kernel result must
# equal twice the original values.
assert numpy.allclose(a_doubled, 2 * a), "doublify kernel result differs from 2 * a"

print("original array:")
print(a)
print("doubled with kernel:")
print(a_doubled)

original array:
[[-1.620548    0.13879654  0.96574795  1.1577861 ]
 [ 0.04036034  0.42371148  0.7621992  -2.2119262 ]
 [ 1.2848102   0.33688763  1.0071028   0.3925474 ]
 [-0.5067073  -0.34551117  0.70004815 -0.9857577 ]]
doubled with kernel:
[[-3.241096    0.27759308  1.9314959   2.3155723 ]
 [ 0.08072069  0.84742296  1.5243984  -4.4238524 ]
 [ 2.5696204   0.67377526  2.0142057   0.7850948 ]
 [-1.0134146  -0.69102234  1.4000963  -1.9715154 ]]


#### Alternate kernel invocation with `cuda.InOut`

In [8]:
# Same kernel, but the host array is wrapped in cuda.InOut and passed directly
# instead of an explicitly allocated device buffer; `a` itself is then printed
# as the result. Note that this changes `a` in place.
func(cuda.InOut(a), block=(4, 4, 1))
print("doubled with InOut:", a, sep="\n")

doubled with InOut:
[[-3.241096    0.27759308  1.9314959   2.3155723 ]
 [ 0.08072069  0.84742296  1.5243984  -4.4238524 ]
 [ 2.5696204   0.67377526  2.0142057   0.7850948 ]
 [-1.0134146  -0.69102234  1.4000963  -1.9715154 ]]


## Dump Device Properties
from [pycuda dump_properties example](https://github.com/inducer/pycuda/blob/main/examples/dump_properties.py)

In [10]:
import pycuda.driver as drv

# Initialise the CUDA driver API, then report how many devices it exposes.
drv.init()
device_count = drv.Device.count()
print("%d device(s) found." % device_count)

1 device(s) found.


In [11]:
# For every CUDA device: print its name, compute capability and total memory,
# followed by all of its attributes sorted by attribute name.
for ordinal in range(drv.Device.count()):
    dev = drv.Device(ordinal)

    # Header lines for this device.
    print("Device #%d: %s" % (ordinal, dev.name()))
    print("  Compute Capability: %d.%d" % dev.compute_capability())
    print("  Total Memory: %s KB" % (dev.total_memory()//(1024)))

    # Attribute keys are stringified first so the pairs sort alphabetically.
    atts = sorted(
        (str(att), value) for att, value in dev.get_attributes().items()
    )

    for att, value in atts:
        print(f"  {att}: {value}")

Device #0: Tesla T4
  Compute Capability: 7.5
  Total Memory: 15464512 KB
  ASYNC_ENGINE_COUNT: 3
  CAN_MAP_HOST_MEMORY: 1
  CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: 1
  CLOCK_RATE: 1590000
  COMPUTE_CAPABILITY_MAJOR: 7
  COMPUTE_CAPABILITY_MINOR: 5
  COMPUTE_MODE: DEFAULT
  COMPUTE_PREEMPTION_SUPPORTED: 1
  CONCURRENT_KERNELS: 1
  CONCURRENT_MANAGED_ACCESS: 1
  DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: 0
  ECC_ENABLED: 1
  GENERIC_COMPRESSION_SUPPORTED: 0
  GLOBAL_L1_CACHE_SUPPORTED: 1
  GLOBAL_MEMORY_BUS_WIDTH: 256
  GPU_OVERLAP: 1
  HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: 1
  HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: 0
  HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: 0
  HOST_NATIVE_ATOMIC_SUPPORTED: 0
  INTEGRATED: 0
  KERNEL_EXEC_TIMEOUT: 0
  L2_CACHE_SIZE: 4194304
  LOCAL_L1_CACHE_SUPPORTED: 1
  MANAGED_MEMORY: 1
  MAXIMUM_SURFACE1D_LAYERED_LAYERS: 2048
  MAXIMUM_SURFACE1D_LAYERED_WIDTH: 32768
  MAXIMUM_SURFACE1D_WIDTH: 32768
  MAXIMUM_SURFACE2D_HEIGHT: 65536
  MAXIMUM_SURFACE2D_LAYERED_HEIGH