<a href="https://colab.research.google.com/github/emceeashish/travel-planner/blob/main/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
!apt update
!apt install -y libopencv-dev pkg-config

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp3ysmc9l8".
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cli.github.com/packages stable/main amd64 Packages [345 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,227 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:9 http://archive.ubuntu.com/ubuntu jamm

In [25]:
%%writefile cpu_blur.cpp
#include <opencv2/opencv.hpp>
#include <iostream>
#include <chrono>

#include <opencv2/opencv.hpp>
#include <iostream>
#include <chrono>

int main() {
    // Loaded the input image in grayscale format
    cv::Mat img = cv::imread("/content/download.jpg", cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        std::cout << "Error loading image\n";
        return -1;
    }

    // Stored image width and height for processing
    int w = img.cols;
    int h = img.rows;

    // Created the output image by copying the input image
    cv::Mat result = img.clone();

    // Started measuring CPU execution time
    auto start = std::chrono::high_resolution_clock::now();

    // Defined a 3x3 Gaussian kernel
    int kernel[3][3] = {
        {1, 2, 1},
        {2, 4, 2},
        {1, 2, 1}
    };

    // Applied Gaussian blur while skipping border pixels
    for (int y = 1; y < h - 1; y++) {
        for (int x = 1; x < w - 1; x++) {
            int sum = 0;

            // Computed weighted sum of neighboring pixels
            for (int ky = -1; ky <= 1; ky++) {
                for (int kx = -1; kx <= 1; kx++) {
                    sum += img.at<uchar>(y + ky, x + kx) *
                           kernel[ky + 1][kx + 1];
                }
            }

            // Normalized the result and stored it in the blurred pixel
            result.at<uchar>(y, x) = sum / 16;
        }
    }

    // Stopped timing after the blur computation finished
    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

    // Saved the CPU processed output image
    cv::imwrite("cpu_output.jpg", result);

    // Printed the total CPU execution time
    std::cout << "CPU time: " << duration.count() << " ms\n";

    return 0;
}


Overwriting cpu_blur.cpp


In [26]:
!g++ cpu_blur.cpp -o cpu_blur `pkg-config --cflags --libs opencv4`


In [27]:
!./cpu_blur


CPU time: 12 ms


In [28]:
%%writefile gpu_blur.cu
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <iostream>

// Defined a simple CUDA kernel for Gaussian blur
__global__ void blurKernel(unsigned char* in, unsigned char* out, int w, int h) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Checked if the thread was inside image bounds
    if (x >= w || y >= h) return;

    // Defined a 3x3 Gaussian kernel
    int kernel[3][3] = {
        {1, 2, 1},
        {2, 4, 2},
        {1, 2, 1}
    };

    int sum = 0;

    // Applied blur only for non-border pixels
    if (x > 0 && x < w - 1 && y > 0 && y < h - 1) {
        for (int ky = -1; ky <= 1; ky++) {
            for (int kx = -1; kx <= 1; kx++) {
                sum += in[(y + ky) * w + (x + kx)] *
                       kernel[ky + 1][kx + 1];
            }
        }

        // Normalized the result and stored it in output
        out[y * w + x] = sum / 16;
    }
    else {
        // Copied border pixels directly without modification
        out[y * w + x] = in[y * w + x];
    }
}

int main() {
    // Loaded the input image in grayscale format
    cv::Mat img = cv::imread("/content/download.jpg", cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        std::cout << "Error loading image\n";
        return -1;
    }

    // Stored image width, height, and total size
    int w = img.cols;
    int h = img.rows;
    int size = w * h;

    // Allocated memory on the GPU
    unsigned char *d_in, *d_out;
    cudaMalloc(&d_in, size);
    cudaMalloc(&d_out, size);

    // Copied the input image from host to device
    cudaMemcpy(d_in, img.data, size, cudaMemcpyHostToDevice);

    // Configured CUDA grid and block dimensions
    dim3 block(16, 16);
    dim3 grid((w + 15) / 16, (h + 15) / 16);

    // Created CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Recorded the start time and launched the kernel
    cudaEventRecord(start);
    blurKernel<<<grid, block>>>(d_in, d_out, w, h);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // Calculated the GPU execution time
    float time;
    cudaEventElapsedTime(&time, start, stop);

    // Copied the result back from device to host
    cv::Mat result(h, w, CV_8UC1);
    cudaMemcpy(result.data, d_out, size, cudaMemcpyDeviceToHost);

    // Saved the GPU processed output image
    cv::imwrite("gpu_output.jpg", result);

    // Printed the kernel execution time
    std::cout << "GPU time: " << time << " ms\n";

    // Freed allocated GPU memory
    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}


Overwriting gpu_blur.cu


In [29]:
!nvcc gpu_blur.cu -o gpu_blur \
  `pkg-config --cflags --libs opencv4` \
  -arch=sm_75

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^



In [30]:
!./gpu_blur

GPU time: 0.102272 ms


In [31]:
import cv2
import subprocess
import os

print("=== Gaussian Blur Comparison ===")

# Ran the GPU program to generate the GPU output image
print("\nRunning GPU program...")
subprocess.run(["./gpu_blur"])

# Ran the CPU program to generate the CPU output image
print("\nRunning CPU program...")
subprocess.run(["./cpu_blur"])

# Loaded the output images produced by GPU and CPU programs
gpu_img = cv2.imread("gpu_output.jpg", 0)
cpu_img = cv2.imread("cpu_output.jpg", 0)

# Checked whether both output images were loaded correctly
if gpu_img is None or cpu_img is None:
    print("Error loading output images")
    exit()

# Compared the two output images pixel by pixel
if gpu_img.shape == cpu_img.shape:
    diff = cv2.absdiff(gpu_img, cpu_img)
    same_pixels = (diff == 0).sum()
    total_pixels = gpu_img.shape[0] * gpu_img.shape[1]

    print(f"\n=== Results ===")
    print(f"Total pixels: {total_pixels}")
    print(f"Matching pixels: {same_pixels}")
    print(f"Accuracy: {100 * same_pixels / total_pixels:.2f}%")

    if same_pixels == total_pixels:
        print("✓ Perfect match!")
    else:
        print("⚠ Small differences found")
else:
    print("Error: Output sizes don't match")


=== Gaussian Blur Comparison ===

Running GPU program...

Running CPU program...

=== Results ===
Total pixels: 50325
Matching pixels: 50325
Accuracy: 100.00%
✓ Perfect match!
