<a href="https://colab.research.google.com/github/christianmora-star/ejemplo_repositorio/blob/main/ECU1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
config.CUDA_ENABLE_PYNVJITLINK =1


In [3]:
# Christian Ulises Mora Figueroa
# CUDA Steps:
# 1. Initialize data from CPU
# 2. Transfer from CPU to GPU
# 3. Run kernel with defined Grid/Block size (Threads)
# 4. Transfer results from GPU to CPU
# 5. Clear memory

# 1. CUDA kernel
@cuda.jit
def first_kernel(a, result):
  idx = cuda.grid(1)
  if idx < a.size:
    result[idx] = a[idx]

# HOST
def main():
  #2. Initialize data on CPU
  N = 10_000_000
  a_cpu = np.arange(N, dtype = np.float32)

  #-----------------------------------------
  #CPU computation
  #-----------------------------------------
  start = time.time()
  result_cpu = a_cpu
  cpu_time = time.time() - start
  print(f"CPU Time: {cpu_time * 1e3:.2f}")

  #-----------------------------------------
  #GPU computation
  #-----------------------------------------
  #2.- Transfer from CPU to GPU
  start = time.time()
  a_gpu = cuda.to_device(a_cpu)
  result_gpu = cuda.device_array_like(a_cpu) # reserve memory
  transfer_in_time = time.time() - start

  # Kernel launch
  threads_per_block = 128
  blocks_per_grid = (N + threads_per_block -1) //threads_per_block # (10_000_000 + 127) // 128 = 78,125 blocks
  start = time.time()
  first_kernel[blocks_per_grid, threads_per_block](a_cpu, result_gpu) # lunch karnel
  cuda.synchronize()
  kernel_time = time.time() - start

  #Copy back
  start = time.time()
  result_from_gpu = result_gpu.copy_to_host()
  cuda.synchronize()
  transfer_out_time = time.time() - start

  #Report
  print(f"GPU transfer to device: {transfer_in_time * 1e3:.2f} ms")
  print(f"GPU kernel execution: {kernel_time * 1e3:.2f} ms")
  print(f"GPU transfer to host: {transfer_out_time * 1e3:.2f} ms")
  print(f"Total GPU time: {transfer_in_time + kernel_time + transfer_out_time * 1e3:.2f} ms")

  #cleanup
  del a_gpu, result_gpu
  cuda.close()

if __name__ == "__main__":
  main()

CPU Time: 0.00
GPU transfer to device: 120.77 ms
GPU kernel execution: 92.79 ms
GPU transfer to host: 16.21 ms
Total GPU time: 16.43 ms


