<a href="https://colab.research.google.com/github/concurrentes-fiuba/ejemplos-concurrentes/blob/main/practicas/2-vectorizacion/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile indexing.cu

#include <stdio.h>

// Debug kernel: every launched thread prints one line with its own block
// coordinates and thread coordinates (x, y, z) using device-side printf.
__global__
void hello() {
  const unsigned int bx = blockIdx.x, by = blockIdx.y, bz = blockIdx.z;
  const unsigned int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;

  printf("Hello from blockX: %d, blockY: %d, blockZ: %d, threadX: %d, threadY: %d, threadZ: %d\n",
    bx, by, bz, tx, ty, tz);
}

// Checks the kernel launch that was just issued and waits for it to finish.
//
// label: short name of the launch, used only in the error message.
// Returns true when the launch and its execution both succeeded; otherwise
// prints the CUDA error string to stderr and returns false.
static bool kernelFinished(const char* label) {
  // A <<<...>>> launch does not return a status itself; an invalid launch
  // configuration has to be fetched explicitly.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Waits for the kernel to complete and reports errors raised while it ran.
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess) {
    fprintf(stderr, "%s launch failed: %s\n", label, cudaGetErrorString(status));
    return false;
  }
  return true;
}

// Launches the hello kernel three times, with a 1D, a 2D and a 3D
// grid/block layout (512 threads each), waiting for every launch to finish
// before starting the next one.
// Returns 0 on success and 1 as soon as any launch fails.
int main(void) {

  printf("\n\n\n1D\n\n\n");
  // 8 blocks of 64 threads.
  hello<<<8, 64>>>();
  if (!kernelFinished("1D")) {
    return 1;
  }

  printf("\n\n\n2D\n\n\n");
  // 2x4 blocks of 8x8 threads.
  hello<<<dim3(2, 4), dim3(8, 8)>>>();
  if (!kernelFinished("2D")) {
    return 1;
  }

  printf("\n\n\n3D\n\n\n");
  // 2x2x2 blocks of 4x4x4 threads.
  hello<<<dim3(2, 2, 2), dim3(4, 4, 4)>>>();
  if (!kernelFinished("3D")) {
    return 1;
  }

  return 0;
}



Writing indexing.cu


In [None]:
%%shell

# Compile the CUDA source file indexing.cu into the executable "indexing", then run it.
nvcc indexing.cu -o indexing
./indexing




1D


Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 0, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 1, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 2, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 3, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 4, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 5, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 6, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 7, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 8, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 9, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 10, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0, threadX: 11, threadY: 0, threadZ: 0
Hello from blockX: 2, blockY: 0, blockZ: 0



In [6]:
%%shell
# Download the lodepng source and header (compiled together with image.cpp in a later cell)
# and the sample input picture totk.jpg.
wget https://raw.githubusercontent.com/lvandeve/lodepng/refs/heads/master/lodepng.cpp
wget https://raw.githubusercontent.com/lvandeve/lodepng/refs/heads/master/lodepng.h
wget https://raw.githubusercontent.com/concurrentes-fiuba/ejemplos-concurrentes/refs/heads/main/practicas/2-vectorizacion/data/totk.jpg

--2024-11-28 14:12:40--  https://raw.githubusercontent.com/lvandeve/lodepng/refs/heads/master/lodepng.cpp
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 302868 (296K) [text/plain]
Saving to: ‘lodepng.cpp’


2024-11-28 14:12:40 (9.01 MB/s) - ‘lodepng.cpp’ saved [302868/302868]

--2024-11-28 14:12:40--  https://raw.githubusercontent.com/lvandeve/lodepng/refs/heads/master/lodepng.h
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102668 (100K) [text/plain]
Saving to: ‘lodepng.h’


2024-11-28 14:12:41 (5.70 MB/s) - ‘lodepng.h’ saved [



In [8]:
# Re-encode the downloaded JPEG as a PNG file; the CUDA program below loads
# its input from "totk.png" using lodepng.
from PIL import Image
img = Image.open('totk.jpg')
img.save('totk.png')

In [43]:
%%writefile image.cpp
// CUDA sources normally use the .cu extension. This file is named .cpp so the notebook
// applies C++ syntax highlighting; it is compiled with "nvcc -x cu" so nvcc treats it as CUDA.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <iostream>

#include "lodepng.h"

// cudaAssert(call): checks the cudaError_t produced by `call` and reports a
// failure together with the file and line of the call site.
// The do/while(0) wrapper makes the macro expand to exactly one statement, so
// "if (x) cudaAssert(y); else ..." parses as written.
#define cudaAssert(ans) do { cudaAssertLine((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr.
//
// code:  status returned by the CUDA call.
// file:  source file of the call site (used in the message only).
// line:  source line of the call site (used in the message only).
// abort: when true (the default) the process exits, using the error code as
//        its exit status; when false the function returns after printing.
// Does nothing when code is cudaSuccess.
inline void cudaAssertLine(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
   {
      return;
   }

   fprintf(stderr,"cudaAssert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// Converts an image to grayscale in place, one thread per pixel.
//
// rawImage: pixel buffer with 4 bytes per pixel, rows stored back to back
//           (width * height * 4 bytes in total). Bytes 0..2 of every pixel are
//           overwritten with the gray value; byte 3 is left untouched.
// width:    image width in pixels.
// height:   image height in pixels.
//
// Launch layout: a 2D grid of 2D blocks in which x indexes columns and y
// indexes rows. The grid may be larger than the image; threads that fall
// outside width x height return without touching memory.
__global__
void grayscale(uint8_t* rawImage, uint32_t width, uint32_t height) {

  const uint32_t pixelX = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t pixelY = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads in the padding of the last block row/column have no pixel.
  if (pixelX >= width || pixelY >= height) {
    return;
  }

  // Compute the byte offset in size_t: in 32-bit arithmetic
  // (width * pixelY + pixelX) * 4 wraps around once the image reaches 4 GiB.
  const size_t offset = (static_cast<size_t>(width) * pixelY + pixelX) * 4;
  uint8_t* pixel = rawImage + offset;

  // Weighted sum 0.299 * byte0 + 0.587 * byte1 + 0.114 * byte2, truncated to
  // an integer. The weights are double literals, so the sum is evaluated in
  // double precision; that is kept on purpose so the output bytes do not change.
  uint8_t grayscale_value = (uint8_t)(((float)pixel[0]) * 0.299 + ((float)pixel[1]) * 0.587 + ((float)pixel[2]) * 0.114);
  pixel[0] = grayscale_value;
  pixel[1] = grayscale_value;
  pixel[2] = grayscale_value;

}

// Loads totk.png, converts it to grayscale on the GPU and writes the result
// to out.png.
//
// Returns 0 on success. Exits with -1 when the PNG cannot be decoded or
// encoded, and through cudaAssert (exit status = CUDA error code) when any
// CUDA call or the kernel launch fails.
int main(void) {

  printf("loading image\n");

  // Decode the PNG into a host buffer of 8-bit RGBA pixels. lodepng allocates
  // the buffer and hands it back through pBytes; it is released with free()
  // once the output file has been written.
  uint8_t* pBytes = nullptr;
  uint32_t width        = 0;
  uint32_t height       = 0;
  uint32_t decodeRes    = lodepng_decode_file(&pBytes, &width, &height, "totk.png", LCT_RGBA, 8);

  if (decodeRes != 0) {
      std::cerr <<": loading png file failed. Error= "<< lodepng_error_text(decodeRes) << "(" << decodeRes << ") \n";
      exit(-1);
  }

  // width and height are unsigned, so they are printed with %u.
  printf("image size %ux%u\n", width, height);

  // 4 bytes per pixel. Computed in size_t so the product cannot wrap around
  // in 32-bit arithmetic for very large images.
  const size_t buffSize = static_cast<size_t>(width) * height * 4;

  // Allocate device memory.
  uint8_t* rawImage = nullptr;
  cudaAssert(cudaMalloc(&rawImage, buffSize));

  printf("copying image to device\n");

  // Transfer data from the host to the device.
  cudaAssert(cudaMemcpy(rawImage, pBytes, buffSize, cudaMemcpyHostToDevice));

  printf("running\n");

  // 32x32 = 1024 threads per block, one thread per pixel.
  const dim3 threadsPerBlock(32, 32);
  // Number of blocks per axis: integer division rounding up, so the grid
  // covers the whole image even when a dimension is not a multiple of 32.
  const dim3 numBlocks((width - 1) / threadsPerBlock.x + 1, (height - 1) / threadsPerBlock.y + 1);

  // Execute kernel
  grayscale<<<numBlocks, threadsPerBlock>>>(rawImage, width, height);

  // The launch statement returns no status; an invalid launch configuration
  // is only visible through cudaGetLastError().
  cudaAssert(cudaGetLastError());

  // Wait for GPU to finish before accessing on host
  cudaAssert(cudaDeviceSynchronize());

  printf("copying result\n");

  // Transfer results back into the host buffer, overwriting the original pixels.
  cudaAssert(cudaMemcpy(pBytes, rawImage, buffSize, cudaMemcpyDeviceToHost));

  cudaAssert(cudaDeviceSynchronize());

  // Free device memory
  cudaAssert(cudaFree(rawImage));

  printf("saving file\n");

  uint32_t res = lodepng_encode_file("out.png", pBytes, width, height, LCT_RGBA, 8);

  // The host pixels are no longer needed whether or not encoding succeeded.
  free(pBytes);
  pBytes = nullptr;

  if (res != 0) {
      std::cerr << " saveImage: lodepng_encode_file returned error= "
                              << lodepng_error_text(res) << "(" << res << ")\n";
      exit(-1);
  }

  return 0;
}



Overwriting image.cpp


In [44]:
%%shell

# Build the grayscale program. "-x cu" tells nvcc to treat the input files as CUDA
# source regardless of their .cpp extension, which image.cpp needs for its kernel.
nvcc -x cu image.cpp lodepng.cpp -o image
# Disabled alternative: run the binary under the nvprof profiler.
# nvprof ./image
./image

loading image
image size 7680x4320
copying image to device
running
copying result
saving file




In [45]:
# Re-encode the PNG written by the CUDA program (out.png) as a JPEG file.
from PIL import Image
img = Image.open('out.png')
img.save('out.jpg')