This code models the memory access pattern of:

- Convolutions

- Decoder refinement layers

- U-Net skip connections

Specifically:

- The data are large 2D feature maps.

- Each output pixel depends on nearby input pixels.

In [27]:
%%writefile neighborhood_access.cu
#include <stdio.h>

// Five-point (cross-shaped) mean filter over a row-major 2D float image.
//
// Each thread processes ONE pixel: it reads that pixel and its left, right,
// upper and lower neighbors from `input` and stores their average in `output`.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads (x indexes columns, y indexes rows). Threads that fall outside the
// image, or on its one-pixel border, do nothing.
//
// Parameters:
//   input   width * height floats, row-major; only read.
//   output  width * height floats, row-major; only interior pixels are written.
//   width   number of columns.
//   height  number of rows.
//
// Preconditions / contract:
//   - `input` and `output` must not overlap: every thread reads pixels that
//     neighboring threads write, so in-place use would be a data race.
//   - Border pixels of `output` are left untouched, so the caller must
//     initialize `output` if it intends to read them.
//   - No shared memory is used.
__global__ void neighborhood_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    int width,
    int height
) {
    // Global pixel coordinates
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Avoid boundary pixels (simplest choice): they lack a full neighborhood.
    // This guard also rejects the surplus threads of a grid that is larger
    // than the image.
    if (x > 0 && x < width - 1 && y > 0 && y < height - 1) {
        // Index in size_t so that y * width cannot overflow a 32-bit int on
        // large feature maps. x >= 1 and y >= 1 here, so the subtractions
        // below cannot wrap around.
        const size_t stride = (size_t)width;
        const size_t idx = (size_t)y * stride + (size_t)x;

        float center = input[idx];
        float left   = input[idx - 1];
        float right  = input[idx + 1];
        float up     = input[idx - stride];
        float down   = input[idx + stride];

        output[idx] = (center + left + right + up + down) / 5.0f;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Demo driver: fills an 8x8 image with 0..63, runs neighborhood_kernel on it
// and prints the resulting feature map.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // Image size (small for clarity). Declared const so the host arrays
    // below have a compile-time size instead of being variable-length arrays.
    const int width = 8;
    const int height = 8;
    const int N = width * height;
    const size_t bytes = N * sizeof(float);

    // Host memory
    float h_input[N];
    float h_output[N];

    // Fill input with known values
    for (int i = 0; i < N; i++) {
        h_input[i] = (float)i;
    }

    // Device memory (null-initialized so the cleanup below is always safe)
    float* d_input = nullptr;
    float* d_output = nullptr;

    bool ok = cuda_ok(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)")
           && cuda_ok(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)")
           // The kernel never writes border pixels and cudaMalloc does not
           // clear memory, so zero the output to make the border well defined.
           && cuda_ok(cudaMemset(d_output, 0, bytes), "cudaMemset(d_output)")
           // Copy input from CPU -> GPU
           && cuda_ok(cudaMemcpy(d_input, h_input, bytes,
                                 cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    if (ok) {
        // 2D execution configuration: ceil-divide so the grid covers the image
        dim3 threadsPerBlock(16, 16);
        dim3 blocks(
            (width + threadsPerBlock.x - 1) / threadsPerBlock.x,
            (height + threadsPerBlock.y - 1) / threadsPerBlock.y
        );

        // Launch kernel
        neighborhood_kernel<<<blocks, threadsPerBlock>>>(
            d_input, d_output, width, height
        );

        // Launch-configuration errors show up in cudaGetLastError();
        // faults during execution are returned by the synchronizing call.
        ok = cuda_ok(cudaGetLastError(), "kernel launch")
          && cuda_ok(cudaDeviceSynchronize(), "kernel execution");
    }

    if (ok) {
        // Copy result back
        ok = cuda_ok(cudaMemcpy(h_output, d_output, bytes,
                                cudaMemcpyDeviceToHost),
                     "cudaMemcpy(device to host)");
    }

    if (ok) {
        // Print output as 2D image
        printf("Output feature map:\n");
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                printf("%6.1f ", h_output[y * width + x]);
            }
            printf("\n");
        }
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(d_input);
    cudaFree(d_output);
    return ok ? 0 : 1;
}


Writing neighborhood_access.cu


In [29]:
!nvcc -arch=native neighborhood_access.cu -o neighborhood_access

In [30]:
!./neighborhood_access

Output feature map:
   0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0 
   0.0    9.0   10.0   11.0   12.0   13.0   14.0    0.0 
   0.0   17.0   18.0   19.0   20.0   21.0   22.0    0.0 
   0.0   25.0   26.0   27.0   28.0   29.0   30.0    0.0 
   0.0   33.0   34.0   35.0   36.0   37.0   38.0    0.0 
   0.0   41.0   42.0   43.0   44.0   45.0   46.0    0.0 
   0.0   49.0   50.0   51.0   52.0   53.0   54.0    0.0 
   0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0 
