<a href="https://colab.research.google.com/github/gummadhav/Let_us_Learn/blob/main/SingleThreaded_CUDA_Stream_With_PipeLining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
%%shell
# Update package lists
apt-get update

# Install essential build tools (if not already present, good practice)
apt-get install -y build-essential

# Install OpenCV development libraries (C++ headers and shared libraries)
# 'libopencv-dev' provides the development files
# 'python3-opencv' is for Python, but sometimes pulled in as a dependency
apt-get install -y libopencv-dev python3-opencv

# Verify OpenCV version (optional, but good for checking)
pkg-config --modversion opencv4 # For OpenCV 4.x
# Or for older versions: pkg-config --modversion opencv

/bin/bash: line 1: fg: no job control
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [80.4 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,002 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,789 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,237 kB]
Hit:12 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:1



In [None]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp0t99kg18".


In [None]:
%%shell
nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0




In [None]:
%%shell
nvidia-smi

Sun Sep  7 15:40:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                



In [None]:
%%writefile single_threaded_CUDA_Stream_with_pipelining.cu

#include <opencv2/opencv.hpp>
#include <vector>
#include <string>
#include <filesystem>
#include <iostream>
#include "cuda_runtime.h"

// Macro for CUDA error checking
#define CHECK_CUDA(call) \
    { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    }

// Utility: get image paths from folder
std::vector<std::string> get_image_paths(const std::string& folder) {
    std::vector<std::string> paths;
    if (!std::filesystem::exists(folder)) {
        std::cerr << "Input folder does not exist: " << folder << std::endl;
        return paths;
    }
    for (const auto& entry : std::filesystem::directory_iterator(folder)) {
        if (entry.is_regular_file() && entry.path().extension() == ".bmp") {
            paths.push_back(entry.path().string());
        }
    }
    return paths;
}

// CUDA Gaussian Filter kernel
__global__
void gaussian_filter_kernel(const uchar* input, uchar* output, int width, int height, const float* kernel) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    float sum = 0, kSum = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            int ix = min(max(x+dx,0), width-1);
            int iy = min(max(y+dy,0), height-1);
            float k = kernel[(dy+1)*3 + (dx+1)];
            sum += input[iy*width + ix] * k;
            kSum += k;
        }
    }
    float result = kSum > 0 ? sum / kSum : 0;
    result = min(max(result, 0.0f), 255.0f);
    output[y*width + x] = static_cast<uchar>(result);
}

// CUDA Laplacian Edge Detection kernel
__global__
void laplacian_filter_kernel(const uchar* input, uchar* output, int width, int height) {
    int kernel[3][3] = { {-1,-1,-1}, {-1,8,-1}, {-1,-1,-1} };
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    int sum = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            int ix = min(max(x+dx,0), width-1);
            int iy = min(max(y+dy,0), height-1);
            sum += input[iy*width + ix] * kernel[dy+1][dx+1];
        }
    }
    int result = sum + 128;
    result = min(max(result, 0), 255);
    output[y*width + x] = static_cast<uchar>(result);
}

// Empty kernel for warm-up
__global__ void empty_kernel(int* p) { *p = 1; }

// Warm-up function to prime the CUDA context
void warm_up() {
    std::cout << "Warming up CUDA..." << std::endl;
    int* d_temp;
    // Use a small allocation for warm-up to avoid large memory footprint
    CHECK_CUDA(cudaMalloc(&d_temp, sizeof(int)));
    empty_kernel<<<1, 1>>>(d_temp);
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaFree(d_temp));
    std::cout << "Warm-up complete." << std::endl;
}

int main() {
    std::string input_folder  = "input_images";
    std::string output_folder = "output_images";
    std::filesystem::create_directories(output_folder);

    auto paths = get_image_paths(input_folder);
    if (paths.empty()) {
        std::cerr << "No BMP images found in " << input_folder << std::endl;
        return 1;
    }

    warm_up();

    // Use a single CUDA stream for all operations
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    // Allocate two sets of host-side input and output buffers for pipelining
    // Use cv::Mat for host buffers to leverage OpenCV's image handling
    std::vector<cv::Mat> h_input(2);
    std::vector<cv::Mat> h_output(2);

    // Allocate two sets of device-side buffers (ping-pong)
    uchar *d_input[2], *d_gauss[2], *d_laplace[2];
    float *d_gkernel;

    // Get dimensions from the first image to allocate buffers
    h_input[0] = cv::imread(paths[0], cv::IMREAD_GRAYSCALE);
    if (h_input[0].empty()) {
        std::cerr << "Failed to load first image: " << paths[0] << std::endl;
        return 1;
    }
    int width   = h_input[0].cols;
    int height  = h_input[0].rows;
    size_t size = width * height * sizeof(uchar);

    // One-time allocation of device buffers and kernel
    for (int i = 0; i < 2; ++i) {
        CHECK_CUDA(cudaMalloc(&d_input[i], size));
        CHECK_CUDA(cudaMalloc(&d_gauss[i], size));
        CHECK_CUDA(cudaMalloc(&d_laplace[i], size));
        // Allocate host output buffer as well
        h_output[i].create(height, width, CV_8U);
    }
    CHECK_CUDA(cudaMalloc(&d_gkernel, 9 * sizeof(float)));

    // Copy the Gaussian kernel to the device
    float gaussKernel[9] = {1.f/16, 2.f/16, 1.f/16, 2.f/16, 4.f/16, 2.f/16, 1.f/16, 2.f/16, 1.f/16};
    CHECK_CUDA(cudaMemcpy(d_gkernel, gaussKernel, 9 * sizeof(float), cudaMemcpyHostToDevice));

    dim3 block(16,16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); // Correct grid calculation
    cv::TickMeter timer;
    timer.start();

    // --- PRIMING THE PIPELINE ---
    // Load and asynchronously copy the first image (idx 0)
    CHECK_CUDA(cudaMemcpyAsync(d_input[0], h_input[0].data, size, cudaMemcpyHostToDevice, stream));

    // --- MAIN PIPELINE LOOP ---
    for (size_t i = 0; i < paths.size(); ++i) {
        int current_idx = i % 2;
        int next_idx = (i + 1) % 2;

        // 1. Asynchronously read the next image and copy it to device
        if (i + 1 < paths.size()) {
            h_input[next_idx] = cv::imread(paths[i+1], cv::IMREAD_GRAYSCALE);
            if (h_input[next_idx].empty()) {
                std::cerr << "Failed to load image " << paths[i+1] << ". Skipping." << std::endl;
                 // If loading fails, we should continue with the pipeline for the current image,
                 // but the next image slot will be empty. This might cause issues if not handled carefully.
                 // For simplicity in this fix, we'll just print an error and the next image won't be processed correctly.
                 // A more robust solution would involve checking h_input[next_idx].empty() before the next iteration's memcpy.
            } else {
                 // Ensure the dimensions match if processing multiple images.
                 // This code assumes all images have the same dimensions as the first one.
                 if (h_input[next_idx].cols != width || h_input[next_idx].rows != height) {
                     std::cerr << "Image dimensions mismatch for " << paths[i+1] << ". Skipping." << std::endl;
                     h_input[next_idx].release(); // Release the loaded image
                 } else {
                     CHECK_CUDA(cudaMemcpyAsync(d_input[next_idx], h_input[next_idx].data, size, cudaMemcpyHostToDevice, stream));
                 }
            }
        }

        // 2. Launch kernels for the current image. These kernels will start
        //    as soon as the previous HtoD copy for this buffer is complete.
        //    Ensure kernels are only launched if the current image was successfully loaded.
        if (!h_input[current_idx].empty()) {
            gaussian_filter_kernel<<<grid, block, 0, stream>>>(d_input[current_idx], d_gauss[current_idx], width, height, d_gkernel);
            laplacian_filter_kernel<<<grid, block, 0, stream>>>(d_gauss[current_idx], d_laplace[current_idx], width, height);

            // 3. Asynchronously copy the result back to the host
            CHECK_CUDA(cudaMemcpyAsync(h_output[current_idx].data, d_laplace[current_idx], size, cudaMemcpyDeviceToHost, stream));

            // Wait for the current DtoH copy to finish before saving the file
            // This is the only synchronization point necessary within the loop
            CHECK_CUDA(cudaStreamSynchronize(stream));

            // 4. Save the output file to disk
            std::string output_path = output_folder + "/" + std::filesystem::path(paths[i]).stem().string() + "_gpu.bmp";
            if (!cv::imwrite(output_path, h_output[current_idx])) {
                std::cerr << "Failed to save GPU image: " << output_path << std::endl;
            }
            std::cout << "Processed: " << paths[i] << std::endl;
        } else {
             std::cerr << "Skipping processing for " << paths[i] << " due to previous loading failure." << std::endl;
        }
    }

    // Ensure all operations on the stream are complete before cleanup
    CHECK_CUDA(cudaStreamSynchronize(stream));

    timer.stop();
    std::cout << "\nTotal execution time for " << paths.size() << " images: " << timer.getTimeMilli() << " ms" << std::endl;

    // --- CLEANUP ---
    for (int i = 0; i < 2; ++i) {
        CHECK_CUDA(cudaFree(d_input[i]));
        CHECK_CUDA(cudaFree(d_gauss[i]));
        CHECK_CUDA(cudaFree(d_laplace[i]));
    }
    CHECK_CUDA(cudaFree(d_gkernel));
    CHECK_CUDA(cudaStreamDestroy(stream));

    return 0;
}


Overwriting single_threaded_CUDA_Stream_with_pipelining.cu


In [None]:
%%shell
nvcc single_threaded_CUDA_Stream_with_pipelining.cu -o single_threaded_CUDA_Stream_with_pipelining $(pkg-config --cflags --libs opencv4) -arch=sm_75

   11 |     { \
      |        
  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

   11 |     { \
      |        
  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^





In [None]:
%%shell
ls -l single_threaded_CUDA_Stream_with_pipelining

-rwxr-xr-x 1 root root 1083408 Sep  7 15:55 single_threaded_CUDA_Stream_with_pipelining




In [None]:
%%shell
./single_threaded_CUDA_Stream_with_pipelining

Warming up CUDA...
Warm-up complete.
Processed: input_images/lena.bmp
Processed: input_images/blackbuck.bmp

Total execution time for 2 images: 2.08594 ms




In [19]:
%%writefile single_threaded_CUDA_Stream_with_no_pipelining.cu

#include <opencv2/opencv.hpp>
#include <vector>
#include <string>
#include <filesystem>
#include <iostream>
#include "cuda_runtime.h"

// Macro for CUDA error checking
#define CHECK_CUDA(call) \
    { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    }

// Utility: get image paths from folder
std::vector<std::string> get_image_paths(const std::string& folder) {
    std::vector<std::string> paths;
    for (const auto& entry : std::filesystem::directory_iterator(folder)) {
        if (entry.path().extension() == ".bmp") {
            paths.push_back(entry.path().string());
        }
    }
    return paths;
}

// CUDA Gaussian Filter kernel
__global__
void gaussian_filter_kernel(const uchar* input, uchar* output, int width, int height, const float* kernel) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    float sum = 0, kSum = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            int ix = min(max(x+dx,0), width-1);
            int iy = min(max(y+dy,0), height-1);
            float k = kernel[(dy+1)*3 + (dx+1)];
            sum += input[iy*width + ix] * k;
            kSum += k;
        }
    }
    float result = kSum > 0 ? sum / kSum : 0;
    result = min(max(result, 0.0f), 255.0f);
    output[y*width + x] = static_cast<uchar>(result);
}

// CUDA Laplacian Edge Detection kernel
__global__
void laplacian_filter_kernel(const uchar* input, uchar* output, int width, int height) {
    int kernel[3][3] = { {-1,-1,-1}, {-1,8,-1}, {-1,-1,-1} };
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    int sum = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            int ix = min(max(x+dx,0), width-1);
            int iy = min(max(y+dy,0), height-1);
            sum += input[iy*width + ix] * kernel[dy+1][dx+1];
        }
    }
    int result = sum + 128;
    result = min(max(result, 0), 255);
    output[y*width + x] = static_cast<uchar>(result);
}

// Empty kernel for warm-up
__global__ void empty_kernel(int* p) { *p = 1; }

// Warm-up function to prime the CUDA context
void warm_up() {
    std::cout << "Warming up CUDA..." << std::endl;
    int* d_temp;
    CHECK_CUDA(cudaMalloc(&d_temp, sizeof(int)));
    empty_kernel<<<1, 1>>>(d_temp);
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaFree(d_temp));
    std::cout << "Warm-up complete." << std::endl;
}

int main() {
    std::string input_folder = "input_images";
    std::string output_folder = "output_images";
    std::filesystem::create_directories(output_folder);

    auto paths = get_image_paths(input_folder);
    if (paths.empty()) {
        std::cerr << "No BMP images found in " << input_folder << std::endl;
        return 1;
    }

    warm_up();

    // Copy the Gaussian kernel to the device once
    float gaussKernel[9] = {1.f/16, 2.f/16, 1.f/16, 2.f/16, 4.f/16, 2.f/16, 1.f/16, 2.f/16, 1.f/16};
    float *d_gkernel;
    CHECK_CUDA(cudaMalloc(&d_gkernel, 9 * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_gkernel, gaussKernel, 9 * sizeof(float), cudaMemcpyHostToDevice));

    cv::TickMeter timer;
    timer.start();

    // --- MAIN SERIAL LOOP ---
    for (const auto& path : paths) {
        // Read the image from disk
        cv::Mat h_input = cv::imread(path, cv::IMREAD_GRAYSCALE);
        if (h_input.empty()) {
            std::cerr << "Failed to load image " << path << std::endl;
            continue;
        }

        int width = h_input.cols;
        int height = h_input.rows;
        size_t size = width * height * sizeof(uchar);

        // Allocate device buffers for this specific image
        uchar *d_input, *d_gauss, *d_laplace;
        CHECK_CUDA(cudaMalloc(&d_input, size));
        CHECK_CUDA(cudaMalloc(&d_gauss, size));
        CHECK_CUDA(cudaMalloc(&d_laplace, size));

        // 1. Synchronous Host-to-Device memory copy
        CHECK_CUDA(cudaMemcpy(d_input, h_input.data, size, cudaMemcpyHostToDevice));

        dim3 block(16,16), grid((width+15)/16, (height+15)/16);

        // 2. Launch kernels
        gaussian_filter_kernel<<<grid, block>>>(d_input, d_gauss, width, height, d_gkernel);
        laplacian_filter_kernel<<<grid, block>>>(d_gauss, d_laplace, width, height);

        // 3. Synchronize with the device to wait for kernel completion
        CHECK_CUDA(cudaDeviceSynchronize());

        // 4. Synchronous Device-to-Host memory copy
        cv::Mat h_output(height, width, CV_8U);
        CHECK_CUDA(cudaMemcpy(h_output.data, d_laplace, size, cudaMemcpyDeviceToHost));

        // 5. Save the output file to disk
        std::string output_path = output_folder + "/" + std::filesystem::path(path).stem().string() + "_gpu.bmp";
        if (!cv::imwrite(output_path, h_output)) {
            std::cerr << "Failed to save GPU image: " << output_path << std::endl;
        }
        std::cout << "Processed: " << path << std::endl;

        // Clean up buffers for this image
        CHECK_CUDA(cudaFree(d_input));
        CHECK_CUDA(cudaFree(d_gauss));
        CHECK_CUDA(cudaFree(d_laplace));
    }

    timer.stop();
    std::cout << "\nTotal execution time for " << paths.size() << " images: " << timer.getTimeMilli() << " ms" << std::endl;

    // Final cleanup of the kernel buffer
    CHECK_CUDA(cudaFree(d_gkernel));

    return 0;
}

Overwriting single_threaded_CUDA_Stream_with_no_pipelining.cu


In [20]:
%%shell

nvcc single_threaded_CUDA_Stream_with_no_pipelining.cu -o single_threaded_CUDA_Stream_with_no_pipelining $(pkg-config --cflags --libs opencv4) -arch=sm_75

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^





In [21]:
%%shell

ls -l single_threaded_CUDA_Stream_with_no_pipelining

-rwxr-xr-x 1 root root 1075224 Sep  7 16:25 single_threaded_CUDA_Stream_with_no_pipelining




In [22]:
%%shell

./single_threaded_CUDA_Stream_with_no_pipelining

Warming up CUDA...
Warm-up complete.
Processed: input_images/lena.bmp
Processed: input_images/blackbuck.bmp

Total execution time for 2 images: 10.3978 ms


