<a href="https://colab.research.google.com/github/domeano/Thermo-and-Fluid-Engineering-Lab/blob/master/matmul_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%%writefile matrix_mul_gpu.cu
#include<stdio.h>
#include<math.h>
#include<stdlib.h>


/*
 * Naive integer matrix product: result = A * B for N x N matrices stored
 * row-major in flat arrays (element (r, c) lives at index r * N + c).
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension selects the output
 * column and the y dimension selects the output row; each in-range thread
 * computes exactly one element of `result`. The grid may overshoot N in
 * either dimension, since out-of-range threads exit without touching memory.
 * No shared memory is used.
 */
__global__ void matrix_mul_gpu(int* A, int* B, int* result, int N)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the N x N output have nothing to do.
    if (row >= N || col >= N)
        return;

    const int row_base = row * N;   // start of this thread's row in A and result
    int acc = 0;

    // Dot product of row `row` of A with column `col` of B.
    for (int k = 0; k < N; ++k)
        acc += A[row_base + k] * B[k * N + col];

    result[row_base + col] = acc;
}

/*
 * Host reference implementation of result = A * B for N x N integer matrices
 * stored row-major in flat arrays. Runs the classic triple loop on one host
 * thread; every element of `result` is overwritten.
 */
void matrix_mul_cpu(int* A, int* B, int* result, int N)
{
    for (int row = 0; row < N; ++row)
    {
        const int* lhs_row = A + row * N;   // row `row` of A
        int* out_row = result + row * N;    // row `row` of the output

        for (int col = 0; col < N; ++col)
        {
            // Dot product of A's row with B's column `col`.
            int dot = 0;
            for (int k = 0; k < N; ++k)
            {
                dot += lhs_row[k] * B[k * N + col];
            }
            out_row[col] = dot;
        }
    }
}

/*
 * Fill the N x N inputs and clear both output buffers.
 * Every element of A and B is set to 1; every element of result_cpu and
 * result_gpu is set to 0. Nothing is written when N is not positive.
 */
void Init(int* A, int* B, int* result_cpu, int* result_gpu, int N)
{
    if (N <= 0)
        return;

    // The matrices are flat arrays, so one pass over all N * N slots suffices.
    const int count = N * N;
    for (int idx = 0; idx < count; ++idx)
    {
        A[idx] = 1;
        B[idx] = 1;
        result_cpu[idx] = 0;
        result_gpu[idx] = 0;
    }
}

/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Best-effort migration hint for managed memory. A prefetch that the runtime
 * rejects is not fatal for this program, so the failure is swallowed and the
 * recorded error is cleared to keep it from leaking into a later
 * cudaGetLastError() check.
 */
static void prefetch_hint(const void* ptr, size_t bytes, int device)
{
    if (cudaMemPrefetchAsync(ptr, bytes, device) != cudaSuccess)
        cudaGetLastError();
}

/*
 * Multiplies two N x N all-ones matrices on the host and on the GPU, prints
 * the elapsed time of each run, and reports whether the two results agree.
 * The runtime messages are Korean: "소요시간" is "elapsed time", "값 다름" is
 * "values differ" and "값 일치" is "values match".
 * Returns 0 after the comparison; exits with EXIT_FAILURE on any CUDA or
 * allocation error.
 */
int main()
{
    const int N = 1000;
    const size_t size = (size_t)N * N * sizeof(int);
    int *A = NULL, *B = NULL, *result_cpu = NULL, *result_gpu = NULL;
    bool mismatch = false;
    int deviceId = 0;
    float cpu_time_ms = 0.0f, gpu_time_ms = 0.0f;
    cudaEvent_t start, stop;

    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check_cuda(cudaGetDevice(&deviceId), "cudaGetDevice");

    // Inputs and the GPU output live in managed memory so host and device
    // share the same pointers; the host reference result uses plain malloc.
    check_cuda(cudaMallocManaged(&A, size), "cudaMallocManaged(A)");
    check_cuda(cudaMallocManaged(&B, size), "cudaMallocManaged(B)");
    check_cuda(cudaMallocManaged(&result_gpu, size), "cudaMallocManaged(result_gpu)");
    result_cpu = (int*)malloc(size);
    if (result_cpu == NULL)
    {
        fprintf(stderr, "malloc of %zu bytes failed\n", size);
        exit(EXIT_FAILURE);
    }

    // Host initialisation and the host multiply come first, so ask for the
    // inputs to be resident on the CPU.
    prefetch_hint(A, size, cudaCpuDeviceId);
    prefetch_hint(B, size, cudaCpuDeviceId);

    Init(A, B, result_cpu, result_gpu, N);

    // Host run. The events bracket host-side work here; no GPU work is
    // queued between them, so they act as a plain timer.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    matrix_mul_cpu(A, B, result_cpu, N);
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    check_cuda(cudaEventElapsedTime(&cpu_time_ms, start, stop), "cudaEventElapsedTime(cpu)");
    printf("cpu matrix mul 소요시간 : %f ms\n", cpu_time_ms);

    // Move everything the kernel touches to the device before timing starts.
    prefetch_hint(A, size, deviceId);
    prefetch_hint(B, size, deviceId);
    prefetch_hint(result_gpu, size, deviceId);

    // 16 x 16 threads per block; ceil-division gives just enough blocks to
    // cover N in each dimension without a spare row/column of blocks when N
    // is a multiple of the block edge.
    dim3 threads_per_block(16, 16, 1);
    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                          (N + threads_per_block.y - 1) / threads_per_block.y,
                          1);

    // Device run.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    matrix_mul_gpu<<< number_of_blocks, threads_per_block >>>(A, B, result_gpu, N);
    check_cuda(cudaGetLastError(), "matrix_mul_gpu launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    check_cuda(cudaDeviceSynchronize(), "matrix_mul_gpu execution");
    check_cuda(cudaEventElapsedTime(&gpu_time_ms, start, stop), "cudaEventElapsedTime(gpu)");
    printf("gpu matrix mul 소요시간 : %f ms\n", gpu_time_ms);

    // Bring the result back and wait for the migration before the host reads it.
    prefetch_hint(result_gpu, size, cudaCpuDeviceId);
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize after prefetch");

    // Stop at the first element where the two results disagree.
    for (int idx = 0; idx < N * N; ++idx)
    {
        if (result_cpu[idx] != result_gpu[idx])
        {
            mismatch = true;
            break;
        }
    }
    if (mismatch)
        printf("값 다름");
    else
        printf("값 일치");

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(A), "cudaFree(A)");
    check_cuda(cudaFree(B), "cudaFree(B)");
    check_cuda(cudaFree(result_gpu), "cudaFree(result_gpu)");
    free(result_cpu);
    return 0;
}

Overwriting matrix_mul_gpu.cu


In [5]:
!nvcc -o matrix_mul_gpu matrix_mul_gpu.cu -run

cpu matrix mul 소요시간 : 4243.914551 ms
gpu matrix mul 소요시간 : 8.680832 ms
값 일치