In [None]:
# 1D / 1 Block
#  doblar vector
#   Implementación con SourceModule de duplicar elementos de un vector.

!pip install pycuda

import pycuda.driver as drv
import pycuda.autoinit
from   pycuda.compiler import SourceModule
import numpy as np
import time


if __name__ == '__main__':
    # Doubles every element of a small float32 vector on the GPU using a
    # single 1D thread block (one thread per element).

    # Configuration: query the per-block thread limit of device 0.
    drv.init()
    dev = drv.Device(0)  # Device 0
    max_thr_per_blk = dev.MAX_THREADS_PER_BLOCK
    print("MAX_THREADS_PER_BLOCK: ", max_thr_per_blk)

    # 1. Define the kernel.
    #    The element count is received as `n` instead of being hard-coded, so
    #    the bounds check always agrees with the host-side SIZE.
    mod = SourceModule("""
    __global__ void doblar_vector(float *a, int n) {
        int i = threadIdx.x;      /* single block: thread index == element index */
        if (i < n) {
            a[i] = a[i] * 2;
        }
    }
    """)

    # 2. Build the host vector and reserve GPU memory for it.
    SIZE = 8
    # This cell launches exactly one block with SIZE threads, so SIZE must fit
    # within the device's per-block thread limit.
    if not 0 < SIZE <= max_thr_per_blk:
        raise ValueError(
            "SIZE must be in 1..%d for a single-block launch, got %d"
            % (max_thr_per_blk, SIZE))
    a = np.arange(SIZE, dtype=np.float32)
    print("Vector:", a)
    a_gpu = drv.mem_alloc(a.nbytes)

    # 3. Transfer data host -> GPU.
    drv.memcpy_htod(a_gpu, a)

    # 4. Launch the kernel: 1 block of SIZE threads.
    #    Scalar kernel arguments are passed as sized numpy scalars (np.intc
    #    matches the kernel's `int n`).
    doblar = mod.get_function("doblar_vector")
    doblar(a_gpu, np.intc(SIZE),
           block=(SIZE, 1, 1),
           grid=(1, 1, 1))

    # 5. Transfer data GPU -> host into a buffer with a's shape and dtype.
    a_doubled = np.empty_like(a)
    drv.memcpy_dtoh(a_doubled, a_gpu)

    print("Vector doble:", a_doubled)


In [None]:
# 1D / N Blocks
#  doblar vector
#   Implementación con SourceModule de duplicar elementos de un vector.

!pip install pycuda

import pycuda.driver as drv
import pycuda.autoinit
from   pycuda.compiler import SourceModule
import numpy as np
import time


if __name__ == '__main__':
    # Doubles every element of a float32 vector on the GPU using a 1D grid of
    # several 1D blocks (one thread per element).

    # Configuration: query the per-block thread limit of device 0.
    drv.init()
    dev = drv.Device(0)  # Device 0
    max_thr_per_blk = dev.MAX_THREADS_PER_BLOCK
    print("MAX_THREADS_PER_BLOCK: ", max_thr_per_blk)

    # 1. Define the kernel.
    #    The element count is received as `n` instead of being hard-coded, so
    #    the bounds check always agrees with the host-side SIZE.
    mod = SourceModule("""
    __global__ void doblar_vector(float *a, int n) {
        int i = (blockDim.x * blockIdx.x) + threadIdx.x;   /* global index */
        if (i < n) {   /* threads past the end of the vector do nothing */
            a[i] = a[i] * 2;
        }
    }
    """)

    # 2. Build the host vector and reserve GPU memory for it.
    SIZE = 2048
    a = np.arange(SIZE, dtype=np.float32)
    print("Vector:", a)
    a_gpu = drv.mem_alloc(a.nbytes)

    # 3. Transfer data host -> GPU.
    drv.memcpy_htod(a_gpu, a)

    # 4. Launch the kernel.
    #    The number of blocks is derived from SIZE and the block size
    #    (integer ceil-division). A fixed grid of 2 blocks only covers all
    #    2048 elements when MAX_THREADS_PER_BLOCK is 1024; with a smaller
    #    limit the tail of the vector would be left untouched.
    block_x = max_thr_per_blk
    grid_x = (SIZE + block_x - 1) // block_x
    doblar = mod.get_function("doblar_vector")
    doblar(a_gpu, np.intc(SIZE),
           block=(block_x, 1, 1),
           grid=(grid_x, 1, 1))

    # 5. Transfer data GPU -> host into a buffer with a's shape and dtype.
    a_doubled = np.empty_like(a)
    drv.memcpy_dtoh(a_doubled, a_gpu)

    print("Vector doble:", a_doubled)


In [None]:
#  2D / 1 Block
#   doblar matriz
#   Implementación con SourceModule de duplicar elementos de una matriz.


!pip install pycuda

import pycuda.driver as drv
import pycuda.autoinit
from   pycuda.compiler import SourceModule
import numpy as np
import time


if __name__ == '__main__':
    # Doubles every element of a small float32 matrix on the GPU using a
    # single 2D thread block (one thread per element).

    # Configuration: query the per-block thread limit of device 0.
    drv.init()
    dev = drv.Device(0)  # Device 0
    max_thr_per_blk = dev.MAX_THREADS_PER_BLOCK
    print("MAX_THREADS_PER_BLOCK: ", max_thr_per_blk)

    # 1. Define the kernel.
    #    The matrix shape is received as (rows, cols) instead of being
    #    hard-coded, so the bounds check and the row stride stay correct if
    #    H or W change, including when H != W.
    mod = SourceModule("""
    __global__ void doblar_matriz(float *a, int rows, int cols) {
        int row = (blockDim.x * blockIdx.x) + threadIdx.x;
        int col = (blockDim.y * blockIdx.y) + threadIdx.y;

        if ((row < rows) && (col < cols)) {
            int i = (row * cols) + col;   /* Row major */
            a[i] = a[i] * 2;
        }
    }
    """)

    # 2. Build the host matrix and reserve GPU memory for it.
    H = 5   # rows
    W = 5   # columns
    # This cell launches exactly one block of H x W threads, so the whole
    # matrix must fit within the device's per-block thread limit.
    if H < 1 or W < 1 or H * W > max_thr_per_blk:
        raise ValueError(
            "H * W must be in 1..%d for a single-block launch, got %d x %d"
            % (max_thr_per_blk, H, W))
    a = np.arange(H * W, dtype=np.float32).reshape(H, W)
    print("Matriz:", a, a.nbytes)
    a_gpu = drv.mem_alloc(a.nbytes)

    # 3. Transfer data host -> GPU.
    drv.memcpy_htod(a_gpu, a)

    # 4. Launch the kernel: 1 block of H x W threads (x indexes rows,
    #    y indexes columns, matching the kernel above).
    doblar = mod.get_function("doblar_matriz")
    doblar(a_gpu, np.intc(H), np.intc(W),
           block=(H, W, 1),
           grid=(1, 1, 1))

    # 5. Transfer data GPU -> host. The buffer is fully overwritten by the
    #    copy, so it only needs a's shape and dtype, not initial contents.
    a_doubled = np.empty_like(a)
    drv.memcpy_dtoh(a_doubled, a_gpu)

    print("Matriz doble:", a_doubled)


In [None]:
# 2D / N bloques
#  doblar matriz
#   Implementación con SourceModule de duplicar elementos de una matriz.

!pip install pycuda

import pycuda.driver as drv
import pycuda.autoinit
from   pycuda.compiler import SourceModule
import numpy as np
import time


if __name__ == '__main__':
    # Doubles every element of a 64 x 64 float32 matrix on the GPU using a
    # 2 x 2 grid of 32 x 32 thread blocks (one thread per element).

    # Matrix shape. The kernel below hard-codes 64 for both its bounds check
    # and its row stride, so these values must stay in sync with it.
    H = 64
    W = 64

    # Launch geometry: 32 x 32 = 1024 threads per block, 2 x 2 blocks.
    dim = 32
    grid_x = 2
    grid_y = 2

    # 1. CUDA source for the kernel, compiled at run time.
    kernel_src = """
    __global__ void doblar_matriz(float *a) {
        int row = (blockDim.x * blockIdx.x) + threadIdx.x;
        int col = (blockDim.y * blockIdx.y) + threadIdx.y;

        if ((row < 64) && (col < 64)) {
            int i = (row * 64) + col; /* Row major */ 
            a[i] = a[i] * 2;  
        }
    }
    """
    module = SourceModule(kernel_src)
    kernel = module.get_function("doblar_matriz")

    # 2. Host matrix and a device allocation of the same byte size.
    a = np.arange(H * W, dtype=np.float32).reshape(H, W)
    print("Matriz:", a)
    a_gpu = drv.mem_alloc(a.nbytes)

    # 3. Copy host -> device.
    drv.memcpy_htod(a_gpu, a)

    # 4. Launch the kernel over the whole matrix.
    kernel(a_gpu,
           block=(dim, dim, 1),
           grid=(grid_x, grid_y, 1))

    # 5. Copy device -> host into a flat buffer, then view it as H x W.
    #    The copy overwrites the whole buffer, so its initial contents are
    #    irrelevant.
    a_doubled = np.empty(H * W, dtype=np.float32)
    drv.memcpy_dtoh(a_doubled, a_gpu)

    print("Matriz doble:", a_doubled.reshape(H, W))


In [None]:
# 2D / N bloques (no multiplo)
#  doblar matriz
#   Implementación con SourceModule de duplicar elementos de una matriz.


!pip install pycuda

import pycuda.driver as drv
import pycuda.autoinit
from   pycuda.compiler import SourceModule
import numpy as np
import time
import math


if __name__ == '__main__':
    # Doubles every element of an H x W float32 matrix on the GPU when the
    # matrix dimensions are not multiples of the block size. The grid is
    # rounded up and the kernel's bounds check discards the surplus threads.

    # 1. CUDA source for the kernel; the matrix shape arrives as arguments.
    kernel_src = """
    __global__ void doblar_matriz(float *a, int rows, int cols) {
        int row = (blockDim.x * blockIdx.x) + threadIdx.x;
        int col = (blockDim.y * blockIdx.y) + threadIdx.y;

        if ((row < rows) && (col < cols)) {
            int i = (row * cols) + col;    /* Row major */ 
            a[i] = a[i] * 2;  
        }
    }
    """
    module = SourceModule(kernel_src)
    kernel = module.get_function("doblar_matriz")

    # 2. Host matrix and a device allocation of the same byte size.
    H = 80    # rows
    W = 74    # columns
    a = np.arange(H * W, dtype=np.float32).reshape(H, W)
    print("Matriz:", a)
    a_gpu = drv.mem_alloc(a.nbytes)

    # 3. Copy host -> device.
    drv.memcpy_htod(a_gpu, a)

    # 4. Launch geometry: 32 x 32 threads per block; the block counts are
    #    H / dim and W / dim rounded up (integer ceil-division), so every
    #    row and column is covered.
    dim = 32
    grid_x = (H + dim - 1) // dim   # blocks needed along x (rows)
    grid_y = (W + dim - 1) // dim   # blocks needed along y (columns)
    print("Numero de threads por block:", dim, dim)
    print("Numero de blocks por grid:", grid_x, grid_y)

    kernel(a_gpu, np.intc(H), np.intc(W),
           block=(dim, dim, 1),
           grid=(grid_x, grid_y, 1))

    # 5. Copy device -> host. The copy overwrites the whole buffer, so it
    #    only needs a's shape and dtype.
    a_doubled = np.empty_like(a)
    drv.memcpy_dtoh(a_doubled, a_gpu)

    print("Matriz doble:", a_doubled.reshape(H, W))
