In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpwmolcrct".


In [50]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <chrono>
#include <cassert>

#define BLOCK_SIZE 32 // Edge length of a CUDA thread block; the launch uses BLOCK_SIZE x BLOCK_SIZE threads per block

using namespace std;

// CUDA kernel: applies a 3x3 filter whose nine weights are all -1.
//
// Launch layout: 2D grid of 2D blocks. The x dimension indexes columns and the
// y dimension indexes rows; each thread produces exactly one output element.
// Threads whose (row, col) lies outside the rows x cols matrix do nothing.
// No shared memory is used.
//
// Border handling: neighbour indices are clamped into the matrix, so the
// elements on the edge are reused for the missing neighbours.
//
// in   - input matrix, row-major, rows * cols elements
// out  - output matrix, row-major, rows * cols elements
__global__ void applyFilter3x3(int* in, int* out, int rows, int cols) {
    // Absolute position of this thread inside the matrix.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid may overhang the matrix; surplus threads leave immediately.
    if (row >= rows || col >= cols) {
        return;
    }

    const int lastRow = rows - 1;
    const int lastCol = cols - 1;
    int acc = 0;

    for (int dr = -1; dr <= 1; ++dr) {
        // Offset of the (clamped) neighbour row, computed once per row.
        const int rowBase = min(max(row + dr, 0), lastRow) * cols;
        for (int dc = -1; dc <= 1; ++dc) {
            const int c = min(max(col + dc, 0), lastCol);
            acc -= in[rowBase + c];
        }
    }

    // Store the filtered value.
    out[row * cols + col] = acc;
}

// Creates (or truncates) `filename` and fills it with a random matrix.
//
// File format: the first line is "rows cols"; each following line holds one
// matrix row, with every value followed by a single space. Values are
// produced by rand() % 256.
//
// If the file cannot be opened, an error is printed to stderr and nothing is
// written.
void initializeAndWriteMatrixToFile(const char* filename, int rows, int cols) {
    ofstream out(filename);
    if (out.fail()) {
        cerr << "Error: Unable to open file for writing: " << filename << endl;
        return;
    }

    // Header line with the matrix dimensions.
    out << rows << ' ' << cols << '\n';

    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            out << rand() % 256 << ' ';
        }
        out << '\n';
    }

    // Closing flushes everything that was buffered above.
    out.close();
}

// Reads a rows x cols integer matrix from a text file into `matrix`
// (row-major, caller-allocated, at least rows * cols elements).
//
// Expected file format: two integers "rows cols" followed by rows * cols
// whitespace-separated integers.
//
// Every failure is reported on stderr and reading stops at that point:
//   - the file cannot be opened,
//   - the dimension header cannot be parsed,
//   - the dimensions in the file differ from `rows` / `cols`,
//   - a value is missing or is not an integer (truncated / malformed file).
// Elements after the point of failure are left untouched.
void readMatrixFromFile(const char* filename, int* matrix, int rows, int cols) {
    ifstream file(filename);
    if (!file) {
        cerr << "Error: Unable to open file for reading: " << filename << endl;
        return;
    }

    // Initialised so a failed extraction never leaves them indeterminate.
    int fileRows = 0, fileCols = 0;
    if (!(file >> fileRows >> fileCols)) {
        cerr << "Error: Unable to read matrix dimensions from file: " << filename << endl;
        return;
    }

    if (fileRows != rows || fileCols != cols) {
        cerr << "Error: Matrix dimensions in file do not match expected dimensions." << endl;
        return;
    }

    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            // Widen before multiplying so large matrices cannot overflow int.
            const size_t idx = static_cast<size_t>(i) * static_cast<size_t>(cols) + static_cast<size_t>(j);
            // Once extraction fails the stream stays in a failed state, so
            // continuing would silently skip all remaining elements.
            if (!(file >> matrix[idx])) {
                cerr << "Error: Missing or malformed value at row " << i
                     << ", column " << j << " in file: " << filename << endl;
                return;
            }
        }
    }
    // `file` is closed by its destructor on every return path.
}

// Writes a row-major rows x cols matrix to `filename` as text.
//
// Each matrix row becomes one line; every value is followed by a single
// space. No dimension header is written.
//
// If the file cannot be opened, an error is printed to stderr and nothing is
// written.
void writeMatrixToFile(const char* filename, int* matrix, int rows, int cols) {
    ofstream out(filename);
    if (out.fail()) {
        cerr << "Error: Unable to open file for writing: " << filename << endl;
        return;
    }

    // The elements are visited in storage order, so a running pointer replaces
    // the explicit i * cols + j index computation.
    const int* cursor = matrix;
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            out << *cursor++ << ' ';
        }
        out << '\n';
    }

    out.close();
}

// Sequential (CPU) version of the 3x3 filter whose nine weights are all -1.
//
// For every element, the nine neighbours (the element itself included) are
// summed with weight -1. Neighbour indices are clamped into the matrix, so
// edge elements are reused where a neighbour would fall outside.
//
// in   - input matrix, row-major, rows * cols elements
// out  - output matrix, row-major, rows * cols elements
void applyFilterSequential(int* in, int* out, int rows, int cols) {
    const int lastRow = rows - 1;
    const int lastCol = cols - 1;

    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            int acc = 0;
            for (int dr = -1; dr <= 1; ++dr) {
                // Start of the (clamped) neighbour row.
                const int* srcRow = in + min(max(row + dr, 0), lastRow) * cols;
                for (int dc = -1; dc <= 1; ++dc) {
                    acc -= srcRow[min(max(col + dc, 0), lastCol)];
                }
            }
            out[row * cols + col] = acc;
        }
    }
}

// Prints a readable message and terminates the program when a CUDA runtime
// call (or a kernel launch / execution) reports an error.
static void checkCudaStatus(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

// Runs the 3x3 filter on the GPU and on the CPU, times both, writes both
// results to disk and checks that they are identical.
//
// inputFilename  - text file holding the rows x cols input matrix
// outputFilename - file that receives the GPU result
// rows, cols     - matrix dimensions; both must be positive
//
// The CPU result is written to "seq<rows>.txt".
//
// Timing: "CUDA time" covers the kernel launch plus its execution (the timer
// stops only after the device has been synchronised); host<->device copies
// are not included. "Sequential time" covers the CPU filter only.
//
// Errors: a CUDA failure terminates the program with a message. A host
// allocation failure or non-positive dimensions print a message and return.
// The final comparison uses assert(), so a mismatch aborts when assertions
// are enabled.
void applyFilter(const char* inputFilename, const char* outputFilename, int rows, int cols) {
    // A zero-sized grid is an invalid launch configuration, so reject early.
    if (rows <= 0 || cols <= 0) {
        cerr << "Error: Matrix dimensions must be positive." << endl;
        return;
    }

    // Computed in size_t so large matrices cannot overflow int.
    const size_t count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
    const size_t size = count * sizeof(int);

    // Host (CPU) buffers. The input buffer is zero-initialised so that a failed
    // file read never feeds indeterminate memory into the filters.
    int* h_in = (int*)calloc(count, sizeof(int));
    int* h_out = (int*)malloc(size);
    int* h_seq_out = (int*)malloc(size);
    if (h_in == nullptr || h_out == nullptr || h_seq_out == nullptr) {
        cerr << "Error: Unable to allocate host memory." << endl;
        free(h_in);
        free(h_out);
        free(h_seq_out);
        return;
    }

    // Load the input matrix from the file.
    readMatrixFromFile(inputFilename, h_in, rows, cols);

    // Device (GPU) buffers.
    int* d_in = nullptr;
    int* d_out = nullptr;
    checkCudaStatus(cudaMalloc(&d_in, size), "cudaMalloc d_in");
    checkCudaStatus(cudaMalloc(&d_out, size), "cudaMalloc d_out");

    // Copy the input matrix from host to device.
    checkCudaStatus(cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice), "cudaMemcpy host to device");

    // Launch configuration. The kernel maps x to columns and y to rows, so the
    // grid's x extent must cover `cols` and its y extent must cover `rows`.
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);

    auto startTime = chrono::high_resolution_clock::now();

    // Launch the CUDA kernel.
    applyFilter3x3<<<grid, block>>>(d_in, d_out, rows, cols);
    checkCudaStatus(cudaGetLastError(), "applyFilter3x3 launch");
    // Kernel launches are asynchronous: wait for completion so the measured
    // interval includes the actual execution, and so execution errors surface.
    checkCudaStatus(cudaDeviceSynchronize(), "applyFilter3x3 execution");

    auto endTime = chrono::high_resolution_clock::now();
    double elapsed_time_ms = chrono::duration<double, std::milli>(endTime - startTime).count();
    cout << "CUDA time: " << elapsed_time_ms << " ms" << endl;

    // Copy the result back from device to host.
    checkCudaStatus(cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost), "cudaMemcpy device to host");

    // Write the GPU result to the output file.
    writeMatrixToFile(outputFilename, h_out, rows, cols);

    // Run and time the sequential reference implementation.
    auto seqStartTime = chrono::high_resolution_clock::now();
    applyFilterSequential(h_in, h_seq_out, rows, cols);
    auto seqEndTime = chrono::high_resolution_clock::now();
    double seq_elapsed_time_ms = chrono::duration<double, std::milli>(seqEndTime - seqStartTime).count();
    cout << "Sequential time: " << seq_elapsed_time_ms << " ms" << endl;

    string seqFilename = string("seq") + to_string(rows) + ".txt";
    writeMatrixToFile(seqFilename.c_str(), h_seq_out, rows, cols);

    // The GPU and CPU results must agree element by element.
    for (size_t i = 0; i < count; ++i) {
        assert(h_out[i] == h_seq_out[i]);
    }

    // Release host and device memory.
    free(h_in);
    free(h_out);
    free(h_seq_out);
    checkCudaStatus(cudaFree(d_in), "cudaFree d_in");
    checkCudaStatus(cudaFree(d_out), "cudaFree d_out");
}

// Entry point: runs the GPU/CPU filter comparison on the 1000x1000 matrix.
//
// The calls that generate the input files are commented out, so
// "matrix1000.txt" is expected to exist already when the program starts.
int main() {
    // Supported problem sizes; only the large one is used by the active run.
    const int smallRows = 100, smallCols = 100;
    const int largeRows = 1000, largeCols = 1000;

    // Enable these lines to (re)generate the input files:
    // initializeAndWriteMatrixToFile("matrix100.txt", smallRows, smallCols);
    // initializeAndWriteMatrixToFile("matrix1000.txt", largeRows, largeCols);

    // 100x100 run (currently disabled):
    // applyFilter("matrix100.txt", "result100.txt", smallRows, smallCols);

    // 1000x1000 run.
    applyFilter("matrix1000.txt", "result1000.txt", largeRows, largeCols);

    return 0;
}


CUDA time: 0.188284 ms
Sequential time: 96.7178 ms

