In [1]:
!nvidia-smi


Fri Jan 30 18:01:20 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [19]:
%%writefile pixel_threshold.cu
#include <stdio.h>

// Binarizes `input` against a threshold: output[i] = 1 when input[i] > thresh,
// otherwise 0, for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so any grid/block
// size covers all N elements (including launches with fewer threads than N).
// No shared memory is used.
//
// Parameters:
//   input  - read-only array of at least N floats, dereferenced on the device.
//   output - array of at least N ints, written on the device.
//   N      - number of elements to process; N <= 0 writes nothing.
//   thresh - strict lower bound: values equal to thresh map to 0.
__global__ void threshold_kernel(const float* input, int* output, int N, float thresh) {
    // 64-bit signed arithmetic keeps blockIdx.x * blockDim.x from wrapping on
    // very large grids and keeps the comparison with a negative N well-defined.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < N;
         idx += stride) {
        output[idx] = (input[idx] > thresh) ? 1 : 0;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise; `what` names the
// step that produced `err` so the message identifies where the failure occurred.
static bool cuda_ok(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return false;
}

// Demo driver: fills 16 floats with i * 0.1f, thresholds them at 0.5f on the
// GPU, and prints the resulting 0/1 flags on one line.
// Returns 0 on success and 1 if any CUDA call fails (results are not printed
// in that case, since the host output buffer would be uninitialised).
int main() {
    const int N = 16; // total number of data elements we want to process in parallel on the GPU.
    const size_t inputBytes = N * sizeof(float);
    const size_t outputBytes = N * sizeof(int);

    // host (cpu) memory, sized from N so the count lives in one place
    float h_input[N];
    int h_output[N];

    for (int i = 0; i < N; i++) {
        h_input[i] = i * 0.1f;
    }

    // device (gpu) memory; start as NULL so the cleanup below is safe even
    // when an allocation fails part-way through
    float* d_input = NULL;
    int* d_output = NULL;

    // Each stage runs only if all previous ones succeeded (&& short-circuits).
    bool ok = cuda_ok(cudaMalloc(&d_input, inputBytes), "cudaMalloc d_input")
           && cuda_ok(cudaMalloc(&d_output, outputBytes), "cudaMalloc d_output")
           && cuda_ok(cudaMemcpy(d_input, h_input, inputBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy host->device");

    if (ok) {
        // 8 threads per block: the 16 elements are split across 2 blocks.
        int threadsPerBlock = 8;
        int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; // ceil-div

        threshold_kernel<<<blocks, threadsPerBlock>>>(d_input, d_output, N, 0.5f);

        // A launch does not return a status itself: cudaGetLastError() reports
        // launch/configuration errors, and cudaDeviceSynchronize() makes the
        // CPU wait for the GPU and reports errors raised during execution.
        ok = cuda_ok(cudaGetLastError(), "kernel launch")
          && cuda_ok(cudaDeviceSynchronize(), "kernel execution")
          && cuda_ok(cudaMemcpy(h_output, d_output, outputBytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy device->host");
    }

    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d ", h_output[i]);
        }
        printf("\n");
    }

    // cudaFree on a NULL pointer performs no operation, so this is safe on
    // every path above.
    cudaFree(d_input);
    cudaFree(d_output);
    return ok ? 0 : 1;
}


Overwriting pixel_threshold.cu


In [20]:
!nvcc -arch=sm_75 pixel_threshold.cu -o pixel_threshold


In [21]:
!./pixel_threshold


0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 
