# In [None]:
import time
from PIL import Image
import numpy as np
from math import exp, sqrt
import os
from numba import njit, prange, set_num_threads, cuda

# 3x3 Sobel kernel whose weights differ between the left and right columns,
# so it responds to horizontal intensity changes.
sobel_x = np.array(
    [
        [-1, 0, 1],
        [-2, 0, 2],
        [-1, 0, 1],
    ],
    dtype=np.int32,
)

# 3x3 Sobel kernel whose weights differ between the top and bottom rows,
# so it responds to vertical intensity changes.
sobel_y = np.array(
    [
        [-1, -2, -1],
        [0, 0, 0],
        [1, 2, 1],
    ],
    dtype=np.int32,
)

def read_image(path):
    """Open the image stored at `path` with PIL.

    Returns:
        The object returned by `Image.open`, or None when opening raises
        IOError (the error is printed, not propagated).
    """
    try:
        opened = Image.open(path)
    except IOError as e:
        print(f"Error: {e}")
        return None
    return opened

def write_image(image, path):
    """Save `image` to `path`.

    The "outputs" directory is created beforehand if it is missing. An IOError
    raised while saving is printed rather than propagated.
    """
    # Make sure the default output directory exists before attempting the save.
    create_outputs_directory_if_needed()

    try:
        image.save(path)
    except IOError as e:
        print(f"Error: {e}")
        return

def create_outputs_directory_if_needed():
    """Create the "outputs" directory in the current working directory if absent."""
    outputs_path = "outputs"
    if os.path.exists(outputs_path):
        # Something with that name is already there; nothing to create.
        return
    os.makedirs(outputs_path)

def apply_gaussian_blur(image, kernel, radius, parallel=False):
    """Blur a PIL image with the given convolution kernel.

    Args:
        image: PIL image to blur. It is converted to RGB first, so any mode
            PIL can convert (grayscale, palette, RGBA, ...) is accepted.
        kernel: 2-D weight array with (2 * radius + 1) rows and columns.
        radius: Number of neighbouring pixels taken on each side of a pixel.
        parallel: When true, use the Numba-compiled implementation instead of
            the pure-Python one.

    Returns:
        A new PIL image built from the blurred pixel array.
    """
    # The parallel blur routine indexes exactly three colour channels (and the
    # serial one pads three axes), so normalise the pixel layout to
    # (height, width, 3) before handing it over. Without this a grayscale
    # image crashes and an RGBA image comes back with an all-zero alpha plane.
    img_array = np.array(image.convert('RGB'))

    if parallel:
        blur = apply_gaussian_blur_kernel_parallel
    else:
        blur = apply_gaussian_blur_kernel_serial
    blurred_img_array = blur(img_array, kernel, radius)

    return Image.fromarray(blurred_img_array)

def apply_sobel(image, parallel=False):
    """Run Sobel edge detection on a PIL image.

    The image is converted to grayscale, filtered with either the serial or
    the Numba-parallel Sobel routine, and the resulting edge map is returned
    as an RGB PIL image.
    """
    grayscale = np.array(image.convert('L'))

    if not parallel:
        edge_map = apply_sobel_filter_serial(grayscale)
    else:
        edge_map = apply_sobel_filter_parallel(grayscale)

    edge_image = Image.fromarray(edge_map)
    return edge_image.convert('RGB')

def generate_gaussian_blur_kernel(radius, sigma):
    """Build a normalised square Gaussian kernel.

    Args:
        radius: Kernel half-width; the kernel has 2 * radius + 1 rows/columns.
        sigma: Standard deviation used in the Gaussian exponent.

    Returns:
        A float array whose entries are exp(-(i^2 + j^2) / (2 * sigma^2)) for
        offsets i, j in [-radius, radius], divided by their total.
    """
    side = 2 * radius + 1
    weights = np.zeros((side, side))
    two_sigma_sq = 2 * sigma**2
    offsets = range(-radius, radius + 1)

    total = 0.0
    for row, dy in enumerate(offsets):
        for col, dx in enumerate(offsets):
            weights[row, col] = exp(-(dy**2 + dx**2) / two_sigma_sq)
            total += weights[row, col]

    # Normalise so the weights add up to one.
    weights /= total
    return weights

def apply_gaussian_blur_kernel_serial(img_array, kernel, radius):
    """Convolve an image array with `kernel` using plain Python loops.

    The image is padded by `radius` pixels on every side by replicating its
    edge values, so the output has the same height and width as the input.
    Every channel present in `img_array` is filtered (not just the first
    three); a 2-D single-channel array is accepted and returned as 2-D.

    Args:
        img_array: Pixel data shaped (height, width, channels) or
            (height, width).
        kernel: 2-D weight array with (2 * radius + 1) rows and columns.
        radius: Number of neighbouring pixels taken on each side of a pixel.

    Returns:
        Array with the shape and dtype of `img_array`; each value is the
        weighted sum truncated to an int and clamped to [0, 255].
    """
    # Treat a 2-D image as a single-channel 3-D one so one code path serves both.
    pixels = img_array[:, :, np.newaxis] if img_array.ndim == 2 else img_array
    height, width, channels = pixels.shape

    padded_img = np.pad(pixels, ((radius, radius), (radius, radius), (0, 0)), 'edge')
    blurred_img = np.zeros_like(pixels)
    window = 2 * radius + 1

    for y in range(height):
        for x in range(width):
            for c in range(channels):
                # In padded coordinates the window centred on (y, x) starts at (y, x).
                acc = np.sum(kernel * padded_img[y:y + window, x:x + window, c])
                blurred_img[y, x, c] = min(max(int(acc), 0), 255)

    # Restore the caller's original dimensionality (no-op for 3-D input).
    return blurred_img.reshape(img_array.shape)

def apply_sobel_filter_serial(img_array):
    """Compute the Sobel gradient magnitude of a 2-D image with Python loops.

    Only interior pixels are processed; the one-pixel border of the result
    keeps the zero it was initialised with. Each magnitude is capped at 255
    before being stored in an array of the input's dtype.
    """
    rows, cols = img_array.shape
    result = np.zeros_like(img_array)

    for row in range(1, rows - 1):
        for col in range(1, cols - 1):
            # 3x3 neighbourhood centred on the current pixel.
            neighbourhood = img_array[row - 1:row + 2, col - 1:col + 2]
            gx = np.sum(sobel_x * neighbourhood)
            gy = np.sum(sobel_y * neighbourhood)
            magnitude = sqrt(gx**2 + gy**2)
            result[row, col] = min(magnitude, 255)

    return result

@njit(parallel=True)
def apply_gaussian_blur_kernel_parallel(img_array, kernel, radius):
    """Numba-compiled blur of the first three channels of `img_array`.

    Neighbour coordinates that fall outside the image are clamped to the
    nearest valid row/column. The weighted sum is stored directly into an
    array created with `np.zeros_like(img_array)`.
    """
    rows, cols = img_array.shape[:2]
    last_row = rows - 1
    last_col = cols - 1
    result = np.zeros_like(img_array)

    # NOTE(review): both loops use prange; Numba's documentation says only the
    # outermost prange is run in parallel -- confirm against the Numba version in use.
    for row in prange(rows):
        for col in prange(cols):
            for channel in range(3):  # the three colour channels
                weighted_total = 0.0
                for ky in range(-radius, radius + 1):
                    src_row = min(max(row + ky, 0), last_row)
                    for kx in range(-radius, radius + 1):
                        src_col = min(max(col + kx, 0), last_col)
                        weight = kernel[ky + radius, kx + radius]
                        weighted_total += img_array[src_row, src_col, channel] * weight
                result[row, col, channel] = weighted_total
    return result

@njit(parallel=True)
def apply_sobel_filter_parallel(img_array):
    """Numba-compiled Sobel gradient magnitude of a 2-D image.

    Interior pixels receive the gradient magnitude capped at 255 and converted
    to int; the one-pixel border of the uint8 result stays zero.
    """
    rows, cols = img_array.shape
    result = np.zeros((rows, cols), np.uint8)

    for row in prange(1, rows - 1):
        for col in prange(1, cols - 1):
            grad_x = 0.0
            grad_y = 0.0
            # Accumulate both Sobel responses over the 3x3 neighbourhood.
            for ky in range(-1, 2):
                for kx in range(-1, 2):
                    pixel = img_array[row + ky, col + kx]
                    grad_x += pixel * sobel_x[ky + 1, kx + 1]
                    grad_y += pixel * sobel_y[ky + 1, kx + 1]
            strength = np.sqrt(grad_x**2 + grad_y**2)
            result[row, col] = int(min(strength, 255))
    return result

def apply_gaussian_blur_chunk(img_array, kernel, radius):
    """Blur the first three channels of `img_array` with pure Python loops.

    Neighbour coordinates outside the array are clamped to its own first/last
    row and column. The weighted sum is written unmodified into an array
    created with `np.zeros_like(img_array)`.
    """
    rows, cols = img_array.shape[:2]
    last_row, last_col = rows - 1, cols - 1
    result = np.zeros_like(img_array)
    offsets = range(-radius, radius + 1)

    for row in range(rows):
        for col in range(cols):
            for channel in range(3):
                total = 0.0
                for dy in offsets:
                    src_row = min(max(row + dy, 0), last_row)
                    for dx in offsets:
                        src_col = min(max(col + dx, 0), last_col)
                        total += img_array[src_row, src_col, channel] * kernel[dy + radius, dx + radius]
                result[row, col, channel] = total
    return result

def apply_sobel_chunk(img_array):
    """Compute the Sobel gradient magnitude of a 2-D array with Python loops.

    Interior pixels receive the magnitude capped at 255 and converted to int;
    the one-pixel border of the uint8 result stays zero.
    """
    rows, cols = img_array.shape
    result = np.zeros((rows, cols), np.uint8)

    for row in range(1, rows - 1):
        for col in range(1, cols - 1):
            # 3x3 neighbourhood centred on the current pixel.
            neighbourhood = img_array[row - 1:row + 2, col - 1:col + 2]
            gx = np.sum(sobel_x * neighbourhood)
            gy = np.sum(sobel_y * neighbourhood)
            strength = np.sqrt(gx**2 + gy**2)
            result[row, col] = int(min(strength, 255))
    return result

@cuda.jit
def apply_gaussian_blur_kernel_cuda(img_array, kernel, radius, blurred_img):
    """CUDA kernel: each thread blurs the three channels of one pixel.

    The thread's 2-D grid position is used as (row, column). Threads whose
    position lies outside the image do nothing. Neighbour coordinates outside
    the image are clamped to the nearest valid row/column, and the weighted
    sum is written to `blurred_img`.
    """
    row, col = cuda.grid(2)
    rows, cols = img_array.shape[:2]

    # The launch grid may overshoot the image; surplus threads exit early.
    if row >= rows or col >= cols:
        return

    for channel in range(3):  # the three colour channels
        acc = 0.0
        for ky in range(-radius, radius + 1):
            src_row = min(max(row + ky, 0), rows - 1)
            for kx in range(-radius, radius + 1):
                src_col = min(max(col + kx, 0), cols - 1)
                acc += img_array[src_row, src_col, channel] * kernel[ky + radius, kx + radius]
        blurred_img[row, col, channel] = acc

def apply_gaussian_blur_cuda(img_array, kernel, radius, threadsperblock):
    """Run the CUDA blur kernel on `img_array` and return the result on the host.

    Args:
        img_array: Host pixel array; its first two dimensions size the grid.
        kernel: 2-D weight array passed to the device kernel.
        radius: Kernel half-width forwarded to the device kernel.
        threadsperblock: Two-element block shape used for the launch.

    Returns:
        A host array (created with `np.zeros_like(img_array)`) filled with
        the device result.
    """
    # Host-side buffer that receives the result; starts out as zeros.
    blurred_img = np.zeros_like(img_array)

    # Move the inputs and the output buffer to the device.
    d_input = cuda.to_device(img_array)
    d_weights = cuda.to_device(kernel)
    d_output = cuda.to_device(blurred_img)

    # Enough blocks along each of the first two image dimensions to cover it.
    rows, cols = img_array.shape[0], img_array.shape[1]
    threads_x, threads_y = threadsperblock[0], threadsperblock[1]
    blockspergrid = (
        int(np.ceil(rows / threads_x)),
        int(np.ceil(cols / threads_y)),
    )

    apply_gaussian_blur_kernel_cuda[blockspergrid, threadsperblock](d_input, d_weights, radius, d_output)

    # Bring the blurred pixels back into the host buffer.
    d_output.copy_to_host(blurred_img)
    return blurred_img

@cuda.jit
def apply_sobel_filter_kernel_cuda(img_array, sobel_x, sobel_y, edges_img):
    """CUDA kernel: each thread computes the Sobel magnitude of one interior pixel.

    Args:
        img_array: 2-D grayscale image.
        sobel_x: 3x3 kernel for the horizontal gradient.
        sobel_y: 3x3 kernel for the vertical gradient.
        edges_img: Output array; interior pixels receive the gradient
            magnitude capped at 255. Border pixels are not written and keep
            whatever value the caller put there (the host wrapper in this
            file passes zeros).
    """
    y, x = cuda.grid(2)
    height, width = img_array.shape[:2]

    # Only interior pixels have a full 3x3 neighbourhood. On the border the
    # window would index row/column -1 or height/width, i.e. outside the
    # intended neighbourhood, so those threads (and any beyond the image)
    # are skipped, matching the CPU implementations.
    if y >= 1 and y < height - 1 and x >= 1 and x < width - 1:
        gx = 0.0
        gy = 0.0
        for ky in range(-1, 2):
            for kx in range(-1, 2):
                val = img_array[y + ky, x + kx]
                gx += val * sobel_x[ky + 1, kx + 1]
                gy += val * sobel_y[ky + 1, kx + 1]
        magnitude = int(min(sqrt(gx**2 + gy**2), 255))
        edges_img[y, x] = magnitude

def apply_sobel_filter_cuda(img_array, threadsperblock):
    """Run the CUDA Sobel kernel on a 2-D image and return the edge map.

    Args:
        img_array: Host grayscale array; its two dimensions size the grid.
        threadsperblock: Two-element block shape used for the launch.

    Returns:
        A host array (created with `np.zeros_like(img_array)`) filled with
        the device result.
    """
    edges_img = np.zeros_like(img_array)

    # Copy data to device
    d_img_array = cuda.to_device(img_array)
    d_edges_img = cuda.to_device(edges_img)
    # Transfer the Sobel kernels explicitly, as the blur wrapper does for its
    # kernel. Passing host arrays makes Numba copy them in implicitly on every
    # launch and, per its documentation, copy them back afterwards.
    d_sobel_x = cuda.to_device(sobel_x)
    d_sobel_y = cuda.to_device(sobel_y)

    # Setup the grid and blocks
    blockspergrid_x = int(np.ceil(img_array.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(np.ceil(img_array.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Launch kernel
    apply_sobel_filter_kernel_cuda[blockspergrid, threadsperblock](d_img_array, d_sobel_x, d_sobel_y, d_edges_img)

    # Copy result back to host
    d_edges_img.copy_to_host(edges_img)

    return edges_img

def create_blurred_image(image_path, output_path, radius, sigma, parallel):
    """Read an image, blur it on the CPU and write the result.

    Prints an error and returns without writing anything when the input
    image cannot be read.
    """
    source = read_image(image_path)
    if source is None:
        print("Error: The input image could not be read")
        return

    weights = generate_gaussian_blur_kernel(radius, sigma)
    result = apply_gaussian_blur(source, weights, radius, parallel)
    write_image(result, output_path)

def create_image_with_sharp_edges(image_path, output_path, parallel):
    """Read an image, run CPU Sobel edge detection and write the result.

    Prints an error and returns without writing anything when the input
    image cannot be read.
    """
    source = read_image(image_path)
    if source is None:
        print("Error: The input image could not be read")
        return

    edge_map = apply_sobel(source, parallel)
    # The edge map is converted to RGB before it is saved.
    rgb_edges = edge_map.convert('RGB')
    write_image(rgb_edges, output_path)

def create_blurred_image_cuda(image_path, output_path, radius, sigma, threadsperblock):
    """Read an image, blur it with the CUDA kernel and write the result.

    Args:
        image_path: Path of the image to read.
        output_path: Path the blurred image is saved to.
        radius: Kernel half-width.
        sigma: Standard deviation of the Gaussian.
        threadsperblock: Two-element CUDA block shape.

    Prints an error and returns without writing anything when the input
    image cannot be read.
    """
    input_image = read_image(image_path)
    if input_image is None:
        print("Error: The input image could not be read")
        return
    kernel = generate_gaussian_blur_kernel(radius, sigma)
    # The CUDA kernel indexes exactly three channels and the result is
    # rebuilt as an 'RGB' image, so force that layout up front. A grayscale,
    # palette, RGBA or CMYK file would otherwise produce a 2-D or 4-channel
    # array that the kernel and the 'RGB' reconstruction mishandle.
    img_array = np.array(input_image.convert('RGB'))
    blurred_img_array = apply_gaussian_blur_cuda(img_array, kernel, radius, threadsperblock)
    blurred_image = Image.fromarray(blurred_img_array.astype('uint8'), 'RGB')
    write_image(blurred_image, output_path)

def create_image_with_sharp_edges_cuda(image_path, output_path, threadsperblock):
    """Read an image, run the CUDA Sobel filter and write the edge map as RGB.

    Prints an error and returns without writing anything when the input
    image cannot be read.
    """
    source = read_image(image_path)
    if source is None:
        print("Error: The input image could not be read")
        return

    grayscale = np.array(source.convert('L'))
    edge_map = apply_sobel_filter_cuda(grayscale, threadsperblock)
    rgb_edges = Image.fromarray(edge_map, 'L').convert('RGB')
    write_image(rgb_edges, output_path)

def run_and_test_blur_serial_scalability(image_path, output_path):
    """Time the serial CPU blur for a fixed list of radii with sigma 20.0.

    Each run reads the image, blurs it, overwrites `output_path` and prints
    the elapsed wall-clock time.
    """
    sigma = 20.0
    # Alternative sweep kept for reference: [1, 5, 7, 10, 20]
    radii = [1, 3, 5, 7, 9]

    for radius in radii:
        start_time = time.time()
        print(f"Running with radius {radius}...")
        create_blurred_image(image_path, output_path, radius, sigma, False)
        execution_time = time.time() - start_time
        print(f"    Execution time: {execution_time:.2f} seconds")

def run_test_blur_cuda_scalability_test(image_path, output_path):
    """Time the CUDA blur for several block shapes and kernel radii.

    For every block shape and radius the kernel is generated, the image is
    blurred on the GPU, the result is written to `output_path`, and the
    elapsed wall-clock time of those steps is printed.

    Prints an error and returns when the input image cannot be read.
    """
    input_image = read_image(image_path)
    if input_image is None:
        # read_image returns None on failure; bail out like the other entry points.
        print("Error: The input image could not be read")
        return
    # The CUDA kernel indexes three channels and the result is rebuilt as 'RGB'.
    img_array = np.array(input_image.convert('RGB'))
    threadsperblocks = [(16, 16), (32, 8), (8, 32), (32, 32), (10, 10)] # on T4
    # threadsperblocks = [(8, 8), (16, 16), (32, 4), (4, 32), (14, 14)] # on A100 & V100

    # Untimed warm-up launch so JIT compilation is not charged to the first
    # measurement (the Sobel CUDA benchmark in this file does the same).
    apply_gaussian_blur_cuda(img_array, generate_gaussian_blur_kernel(1, 20.0), 1, (16, 16))

    for threadsperblock in threadsperblocks:
        print(f"Running with {threadsperblock} threads per block...")
        for radius in [5, 10, 20, 40, 80]:
            start = time.time()
            kernel = generate_gaussian_blur_kernel(radius, 20.0)
            blurred_img_array = apply_gaussian_blur_cuda(img_array, kernel, radius, threadsperblock)
            blurred_image = Image.fromarray(blurred_img_array.astype('uint8'), 'RGB')
            write_image(blurred_image, output_path)
            end = time.time()
            print(f"    Execution time: {end - start:.2f} seconds, radius: {radius}")

def run_and_test_sobel_serial_scalability(image_path, output_path):
    """Run the serial CPU Sobel pipeline five times, printing each run's duration."""
    run_numbers = range(1, 6)
    for i in run_numbers:
        start_time = time.time()
        print(f"Executing run number {i}...")
        create_image_with_sharp_edges(image_path, output_path, False)
        execution_time = time.time() - start_time
        print(f"    Execution time: {execution_time:.2f} seconds")

def run_and_test_sobel_cuda_scalability(image_path, output_path):
    """Time the CUDA Sobel filter for several block shapes.

    After one untimed warm-up launch, each block shape is timed over the GPU
    filter plus conversion to RGB and writing to `output_path`.

    Prints an error and returns when the input image cannot be read.
    """
    input_image = read_image(image_path)
    if input_image is None:
        # read_image returns None on failure; calling .convert on it would raise.
        print("Error: The input image could not be read")
        return
    img_array = np.array(input_image.convert('L'))
    threadsperblocks = [(16, 16), (32, 8), (8, 32), (32, 32), (10, 10)] # on T4
    # threadsperblocks = [(8, 8), (16, 16), (32, 4), (4, 32), (14, 14)] # on A100 & V100

    apply_sobel_filter_cuda(img_array, (16, 16))  # trigger jit compilation

    for threadsperblock in threadsperblocks:
        print(f"Running with {threadsperblock} threads per block...")
        start = time.time()
        edges_img_array = apply_sobel_filter_cuda(img_array, threadsperblock)
        edges_image = Image.fromarray(edges_img_array, 'L').convert('RGB')
        write_image(edges_image, output_path)
        end = time.time()
        print(f"    Execution time: {end - start:.2f} seconds")

if __name__ == "__main__":
    # --- Configuration: adjust as needed -------------------------------
    # Filter parameters.
    radius = 9
    sigma = 20.0
    threadsperblock = (16, 16)

    # Input asset and the output locations derived from its name.
    file_name = "squidward_painting"
    file_extension = "jpg"
    input_image_path = f"assets/{file_name}.{file_extension}"
    output_blurred_path = f"outputs/{file_name}_blurred_cuda.{file_extension}"
    output_edges_path = f"outputs/{file_name}_sobel_cuda.{file_extension}"

    # --- Gaussian blur -------------------------------------------------
    # Alternative entry points (uncomment to benchmark instead):
    #   run_and_test_blur_serial_scalability(input_image_path, output_blurred_path)
    #   run_test_blur_cuda_scalability_test(input_image_path, output_blurred_path)
    create_blurred_image_cuda(
        input_image_path,
        output_blurred_path,
        radius,
        sigma,
        threadsperblock,
    )

    # --- Sobel edge detection ------------------------------------------
    # Alternative entry points (uncomment to benchmark instead):
    #   run_and_test_sobel_serial_scalability(input_image_path, output_edges_path)
    #   run_and_test_sobel_cuda_scalability(input_image_path, output_edges_path)
    create_image_with_sharp_edges_cuda(
        input_image_path,
        output_edges_path,
        threadsperblock,
    )