In [1]:
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>


In [2]:
/**
 * Per-pixel brightness remap: pixels brighter than 200 are doubled, all
 * other pixels are scaled by 0.4.
 *
 * Launch layout: a 2D grid of 2D blocks in which x indexes columns and
 * y indexes rows. The grid may overshoot the image; threads that fall
 * outside width x height return without touching memory.
 *
 * @param input   row-major source pixels, width * height floats (read only)
 * @param output  row-major destination pixels, width * height floats
 * @param width   image width in pixels
 * @param height  image height in pixels
 */
__global__ void thresholdKernel(const float* input, float* output, const int width, const int height) {
    // A non-positive dimension would turn into a huge value once compared
    // against the unsigned thread coordinates below, so reject it up front.
    if (width <= 0 || height <= 0) {
        return;
    }

    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (col >= static_cast<unsigned int>(width) || row >= static_cast<unsigned int>(height)) {
        return;
    }

    // Compute the flat index once, in size_t, so large images cannot
    // overflow 32-bit arithmetic.
    const size_t idx = static_cast<size_t>(row) * width + col;
    const float pixel = input[idx];

    // Float literals keep the arithmetic in single precision; a bare 0.4
    // is a double and would promote the whole multiply.
    output[idx] = (pixel > 200.0f) ? pixel * 2.0f : pixel * 0.4f;
}

In [3]:
// Image dimensions in pixels, shared by the host buffers, the kernel launch
// and the result copy. The Python cells hard-code the same 512 x 512 size.
const int width = 512;  
const int height = 512;

In [4]:
// Host-side pixel buffers, one float per pixel in row-major order.
// The trailing () value-initialises every element to 0.0f, so reading
// h_output before a successful device-to-host copy yields zeros rather
// than indeterminate values. The element count is computed in size_t
// to avoid int overflow for large dimensions.
float* h_input = new float[static_cast<size_t>(width) * height]();
float* h_output = new float[static_cast<size_t>(width) * height]();

In [5]:
!pip install Pillow



In [6]:
!pip install numpy



In [7]:
!pip install matplotlib



In [8]:
%%python

from PIL import Image
import numpy as np

# Load the source picture and scale it to the 512 x 512 size that the
# C++ cells use for their width/height constants.
image = Image.open('img_in.jpg')
image = image.resize((512, 512))

# Always collapse to a single 8-bit luminance channel. Converting only when
# exactly three channels are present would let 4-channel input (e.g. CMYK
# or RGBA) through as a 3-D array, which later flattens to four times the
# expected number of pixels. For images already in mode 'L' this is a no-op.
image_array = np.array(image.convert('L'))

In [9]:
%%python

# Show the (abbreviated) grayscale pixel matrix as a quick sanity check.
print(image_array)

[[126 126 125 ...  84  84  84]
 [126 126 125 ...  84  84  84]
 [127 126 126 ...  84  84  84]
 ...
 [ 58  55  63 ...  19  15   8]
 [ 57  56  53 ...  23  17  16]
 [ 55  50  58 ...   5  16  16]]


In [10]:
// Prints a short preview of an image buffer to std::cout: the first three
// values, a " ... " separator, then the last three values. Every value is
// followed by a single space. The buffer is taken to hold width * height
// floats, as given by the global dimensions.
void displayImgArray(float* input) {
    const int previewCount = 3;
    const int pixelCount = width * height;

    // Leading values.
    const float* headEnd = input + previewCount;
    for (const float* cursor = input; cursor != headEnd; ++cursor) {
        std::cout << *cursor << " ";
    }

    std::cout << " ... ";

    // Trailing values.
    const float* tailEnd = input + pixelCount;
    for (const float* cursor = input + (pixelCount - previewCount); cursor != tailEnd; ++cursor) {
        std::cout << *cursor << " ";
    }
}

In [11]:
%%python

import cppyy

# Flatten the 2-D pixel matrix into a plain Python list (row-major order),
# then hand it to C++ as a std::vector<float> through cppyy.
img_list = image_array.reshape(-1).tolist()
FloatVector = cppyy.gbl.std.vector['float']
img_vector = FloatVector(img_list)

In [12]:

// Replaces the global host input buffer h_input with a copy of `input`.
//
// The new buffer is allocated and filled before the old one is released, so
// h_input never points at freed memory, even if the allocation throws.
//
// NOTE(review): the launch cell copies width * height floats out of h_input;
// callers appear to be expected to pass exactly that many values - confirm.
void setImg(const std::vector<float>& input) {
    float* replacement = new float[input.size()];
    std::copy(input.begin(), input.end(), replacement);

    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] h_input;
    h_input = replacement;
}


In [13]:
// Returns a copy of the global host output buffer h_output as a vector of
// width * height floats.
std::vector<float> getOutput() {
    // size_t arithmetic avoids int overflow and the signed/unsigned
    // comparison of the previous index loop; the range constructor
    // allocates once instead of growing through repeated push_back.
    const size_t pixelCount = static_cast<size_t>(width) * height;
    return std::vector<float>(h_output, h_output + pixelCount);
}

In [14]:
%%python

# Copy the flattened pixels into the C++ host buffer via setImg().
cppyy.gbl.setImg(img_vector)

In [15]:
// Preview the first and last three input values to confirm the transfer
// from Python.
displayImgArray(h_input);

126 126 125  ... 5 16 16 

In [16]:
// Device buffers; start as null so the cudaFree calls below are harmless
// no-ops if an allocation never happened.
float* d_input = nullptr;
float* d_output = nullptr;

// Size of one image in bytes, computed once in size_t.
const size_t imageBytes = static_cast<size_t>(width) * height * sizeof(float);

// Reports a failed CUDA runtime call on std::cerr and returns whether the
// call succeeded, so later steps can be skipped after the first failure.
auto cudaOk = [](cudaError_t status, const char* step) -> bool {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << step << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
};

bool gpuOk = cudaOk(cudaMalloc((void**)&d_input, imageBytes), "cudaMalloc(d_input)")
          && cudaOk(cudaMalloc((void**)&d_output, imageBytes), "cudaMalloc(d_output)")
          && cudaOk(cudaMemcpy(d_input, h_input, imageBytes, cudaMemcpyHostToDevice),
                    "copy of input to device");

// 16 x 16 threads per block; the grid is rounded up so it covers the whole
// image even when the dimensions are not multiples of 16.
dim3 dimBlock(16, 16);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);

if (gpuOk) {
    thresholdKernel<<<dimGrid, dimBlock>>>(d_input, d_output, width, height);

    // A kernel launch returns nothing itself: launch-configuration errors
    // show up in cudaGetLastError(), execution errors in the blocking copy.
    gpuOk = cudaOk(cudaGetLastError(), "thresholdKernel launch")
         && cudaOk(cudaMemcpy(h_output, d_output, imageBytes, cudaMemcpyDeviceToHost),
                   "copy of output to host");
}

// Release device memory on both the success and the failure path.
cudaOk(cudaFree(d_input), "cudaFree(d_input)");
cudaOk(cudaFree(d_output), "cudaFree(d_output)");

In [17]:
// Preview the first and last three values written back by the kernel.
displayImgArray(h_output);

50.4 50.4 50  ... 2 6.4 6.4 

In [18]:
// Snapshot of h_output as a std::vector so the Python cells can read it
// through cppyy. Despite the name, this holds the output of the threshold
// kernel, not a blur.
std::vector<float> blurredRes = getOutput();

In [19]:
%%python

# Pull the kernel result back as floats first. The kernel doubles every
# pixel above 200, so values can reach 510; casting those straight to uint8
# does not saturate and would corrupt the brightest pixels.
k = np.array(cppyy.gbl.blurredRes, dtype=np.float32)

# Saturate to the displayable 8-bit range, then convert. reshape() raises if
# the element count is not exactly 512 * 512, whereas an in-place resize()
# would silently pad or truncate.
k = np.clip(k, 0, 255).astype(np.uint8).reshape(512, 512)

In [20]:
%%python

# Wrap the uint8 matrix in a PIL image and write it to img_out.jpg.
Image.fromarray(k).save("img_out.jpg")

<img src="img_out.jpg" align=right width="400">
<img src="img_in.jpg" align=left width="400">