In [5]:
from numba import cuda
import numpy as np
import math
import time

# CUDA kernel for matrix multiplication
@cuda.jit
def matrix_multiply(A, B, C):
    """Compute one element of the matrix product ``C = A @ B`` per thread.

    The thread's 2D grid position selects the output element: the first
    grid coordinate is used as the row index and the second as the column
    index. Threads whose position falls outside ``C`` do nothing, so the
    launch grid may be larger than the output.

    Args:
        A: Left operand, indexed as ``A[row, k]``.
        B: Right operand, indexed as ``B[k, col]``.
        C: Output array, written at ``C[row, col]``.
    """
    i, j = cuda.grid(2)

    # Guard against threads that lie past the edge of the output matrix.
    if i >= C.shape[0] or j >= C.shape[1]:
        return

    # Dot product of row i of A with column j of B.
    acc = 0.0
    inner = A.shape[1]
    for k in range(inner):
        acc += A[i, k] * B[k, j]
    C[i, j] = acc

# Benchmark: naive CUDA matrix multiplication versus NumPy's np.dot on the CPU.

# Problem size: N x N single-precision matrices, large enough to stress the GPU.
N = 8192
A = np.random.rand(N, N).astype(np.float32)
B = np.random.rand(N, N).astype(np.float32)
# Host-side zero matrix; not read anywhere in this section (the GPU result is
# copied into C_result instead). Kept so the name stays defined.
C = np.zeros((N, N), dtype=np.float32)

# Transfer the inputs to the GPU and allocate the output on the device.
A_gpu = cuda.to_device(A)
B_gpu = cuda.to_device(B)
C_gpu = cuda.device_array((N, N), dtype=np.float32)

# Launch configuration: 16x16 threads per block, with enough blocks along each
# grid axis to cover every row (x axis) and every column (y axis) of the output.
threads_per_block = (16, 16)
blocks_per_grid_x = math.ceil(A.shape[0] / threads_per_block[0])
blocks_per_grid_y = math.ceil(B.shape[1] / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# Warm-up: Numba compiles a kernel lazily the first time it is launched with a
# given argument signature. Launch once on tiny arrays of the same dtype,
# dimensionality and layout so that the timed launch below measures execution
# rather than JIT compilation.
_warm_in = cuda.to_device(np.zeros((2, 2), dtype=np.float32))
_warm_out = cuda.device_array((2, 2), dtype=np.float32)
matrix_multiply[(1, 1), threads_per_block](_warm_in, _warm_in, _warm_out)
cuda.synchronize()

# Time the GPU path. The timed region covers the kernel execution and the
# device-to-host copy of the result; the host-to-device uploads above are not
# included. perf_counter is used because it is meant for measuring intervals.
start = time.perf_counter()

matrix_multiply[blocks_per_grid, threads_per_block](A_gpu, B_gpu, C_gpu)

# Kernel launches are asynchronous: wait for the GPU to finish before going on.
cuda.synchronize()

# Copy result back to CPU
C_result = C_gpu.copy_to_host()

end = time.perf_counter()
print(f"Matrix multiplication on GPU completed in {end - start:.4f} seconds.")

# Time the CPU reference computation for comparison.
start_cpu = time.perf_counter()
C_cpu = np.dot(A, B)
end_cpu = time.perf_counter()

print(f"Matrix multiplication on CPU completed in {end_cpu - start_cpu:.4f} seconds.")

# Check that the GPU result agrees with the CPU result, allowing a small
# floating-point tolerance.
if np.allclose(C_result, C_cpu, atol=1e-3):
    print("GPU result matches CPU result!")
else:
    print("Mismatch between GPU and CPU results.")


Matrix multiplication on GPU completed in 15.1931 seconds.
Matrix multiplication on CPU completed in 2.7354 seconds.
GPU result matches CPU result!
