In [1]:
#include <iostream>
#include <cmath>
#include <cuda_runtime.h>
#include<vector>


In [2]:
// Side length of the square weight table below.
const int kernelWidth = 3;

// Blur weights laid out as a kernelWidth x kernelWidth grid.
// The weights are not normalized: they add up to 16.
float kernel[] = {
    1.0f, 2.0f, 1.0f,
    2.0f, 4.0f, 2.0f,
    1.0f, 2.0f, 1.0f,
};



In [3]:
// const int half = kernelWidth / 2;
//         float blur = 0.0;
//         for(int i = -half; i <= half; i++) {
//             for(int j = -half; j <= half; j++) {

//                 const unsigned int y = max(0, min(height - 1, row + i));
//                 const unsigned int x = max(0, min(width - 1, col + j));

//                 const float w = kernel[(j + half) + (i + half) * kernelWidth];
//                 blur += w * input[x + y * width];
//             }
//         }

In [4]:
/**
 * Per-pixel kernel for a 2D launch: each thread handles the pixel at
 * (blockIdx * blockDim + threadIdx) in x and y.
 *
 * NOTE(review): the convolution is currently disabled, so this kernel simply
 * copies `input` to `output`; `kernel` and `kernelWidth` are accepted but never
 * read. Before enabling the draft below, make sure `kernel` points to
 * device-accessible memory (a plain host array cannot be dereferenced here).
 *
 * @param input        source image, width * height floats
 * @param output       destination image, width * height floats
 * @param width        image width in pixels
 * @param height       image height in pixels
 * @param kernel       convolution weights (unused while the blur is disabled)
 * @param kernelWidth  side length of the weight grid (unused for now)
 */
__global__ void gaussianBlur(float* input, float* output, const int width, const int height, const float *kernel, const int kernelWidth) {
    // Global pixel coordinates owned by this thread.
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail that fall outside the image do nothing.
    if (y >= height || x >= width) {
        return;
    }

    // Disabled convolution draft (clamp-to-edge borders). Signed coordinates
    // are used because `y + dy` on an unsigned value wraps around at the
    // top/left border instead of going negative.
    //
    //     float sum = 0.0f;
    //     const int half = kernelWidth / 2;
    //     for (int dy = -half; dy <= half; dy++) {
    //         for (int dx = -half; dx <= half; dx++) {
    //             const int sy = min(max((int)y + dy, 0), height - 1);
    //             const int sx = min(max((int)x + dx, 0), width - 1);
    //             sum += input[sx + sy * width]
    //                  * kernel[(dx + half) + (dy + half) * kernelWidth];
    //         }
    //     }

    // Pass-through: the output pixel equals the input pixel.
    const unsigned int idx = y * width + x;
    output[idx] = input[idx];
}

In [5]:
// Image dimensions in pixels; the host buffers and the launch grid are sized from these.
const int width{512};
const int height{512};

In [6]:
// Host-side image buffers holding width * height floats each.
// Both are value-initialized (zero-filled) so that previewing them before an
// image is uploaded, or after a failed device copy, never shows indeterminate
// memory.
float* h_input = new float[width * height]();
float* h_output = new float[width * height]();

In [7]:
!pip install Pillow



In [8]:
!pip install numpy



In [9]:
!pip install matplotlib



In [10]:
%%python

from PIL import Image
import numpy as np


image = Image.open('test1.jpg') 
image = image.resize((512, 512))

image_array = np.array(image)
if len(image_array.shape) == 3 and image_array.shape[2] == 3:
    image_array = np.array(image.convert('L'))

In [11]:
%%python

# Echo the pixel array built by the image-loading cell as a quick sanity check.
print(image_array)

[[126 126 125 ...  84  84  84]
 [126 126 125 ...  84  84  84]
 [127 126 126 ...  84  84  84]
 ...
 [ 58  55  63 ...  19  15   8]
 [ 57  56  53 ...  23  17  16]
 [ 55  50  58 ...   5  16  16]]


In [12]:
// Prints a short preview of an image buffer to std::cout: its first three
// values, a " ... " separator, then the last three values of a buffer that is
// taken to hold width * height floats.
void displayImgArray(float* input) {
    const int total = width * height;

    // Writes input[first], ..., input[last - 1], each followed by one space.
    auto printRange = [input](int first, int last) {
        for (int idx = first; idx < last; ++idx) {
            std::cout << input[idx] << " ";
        }
    };

    printRange(0, 3);
    std::cout << " ... ";
    printRange(total - 3, total);
}

In [13]:
%%python

import cppyy

img_list = image_array.flatten().tolist()
img_vector = cppyy.gbl.std.vector['float'](img_list)

In [14]:

void setImg(const std::vector<float>& input) {
    if (h_input != nullptr) {
        delete[] h_input;
    }

    h_input = new float[input.size()];

    for (size_t i = 0; i < input.size(); i++) {
        h_input[i] = input[i]; // No casting needed
    }
}


In [15]:
// Returns a copy of the host output buffer (the first width * height floats
// of h_output) as a std::vector<float>.
std::vector<float> getOutput() {
    const size_t count = width * height;
    return std::vector<float>(h_output, h_output + count);
}

In [16]:
%%python

# Hand the flattened pixel vector to the C++ side; setImg copies it into h_input.
cppyy.gbl.setImg(img_vector)

In [17]:
// Preview the first and last values of h_input to confirm the upload from Python.
displayImgArray(h_input);

126 126 125  ... 5 16 16 

In [18]:
// Runs gaussianBlur over the whole image: upload h_input and the weight table,
// launch one thread per pixel, then copy the result back into h_output.

// Reports a failed CUDA runtime call on std::cerr; returns true on success.
auto cudaOk = [](cudaError_t status, const char* what) -> bool {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
};

const size_t imageBytes = static_cast<size_t>(width) * height * sizeof(float);
const size_t kernelBytes = static_cast<size_t>(kernelWidth) * kernelWidth * sizeof(float);

float* d_input = nullptr;
float* d_output = nullptr;
float* d_kernel = nullptr;

cudaOk(cudaMalloc((void**)&d_input, imageBytes), "cudaMalloc(d_input)");
cudaOk(cudaMalloc((void**)&d_output, imageBytes), "cudaMalloc(d_output)");
cudaOk(cudaMalloc((void**)&d_kernel, kernelBytes), "cudaMalloc(d_kernel)");

cudaOk(cudaMemcpy(d_input, h_input, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_input)");
// The weight table lives in host memory; device code must be given a device
// copy rather than the host pointer itself.
cudaOk(cudaMemcpy(d_kernel, kernel, kernelBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_kernel)");

// 16x16 threads per block; ceil-divide so the grid covers every pixel.
dim3 dimBlock(16, 16);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);

gaussianBlur<<<dimGrid, dimBlock>>>(d_input, d_output, width, height, d_kernel, kernelWidth);
// Launch-configuration errors are only visible through cudaGetLastError().
cudaOk(cudaGetLastError(), "gaussianBlur launch");

// Blocking copy: also surfaces any error raised while the kernel was running.
cudaOk(cudaMemcpy(h_output, d_output, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_output)");

cudaOk(cudaFree(d_input), "cudaFree(d_input)");
cudaOk(cudaFree(d_output), "cudaFree(d_output)");
cudaOk(cudaFree(d_kernel), "cudaFree(d_kernel)");

In [19]:
// Preview the first and last values copied back from the device into h_output.
displayImgArray(h_output);

126 126 125  ... 5 16 16 

In [20]:
// Snapshot h_output into a std::vector that the Python cells read through cppyy.gbl.
std::vector<float> blurredRes = getOutput();

In [21]:
%%python

k = np.array(cppyy.gbl.blurredRes, dtype = np.uint8)
k.resize(512, 512)

In [22]:
%%python

# Write the 512x512 uint8 array out as a PNG file next to the notebook.
Image.fromarray(k).save("test_out.png")

<img src="./test_out.png" align=left width="400">