<a href="https://colab.research.google.com/github/dongyyyyy/CUDA/blob/master/Week2/matmul.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%writefile cudabasic.cu
#include <stdio.h>
#include <time.h>

/*
 * Naive integer matrix multiply: P = M * N.
 *
 * All three matrices are square (width x width) and indexed row-major as
 * [row * width + col]. Each thread computes exactly one element of P.
 *
 * Launch layout: a 2D grid of 2D blocks. The y dimension selects the row and
 * the x dimension selects the column. Threads whose (row, col) lies outside
 * the matrix return without touching memory, so the grid is allowed to
 * overshoot the matrix when width is not a multiple of the block size.
 *
 * No shared memory is used.
 */
__global__ void MatrixMul(int *M, int *N, int *P, int width)
{
    // Global row / column handled by this thread.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads in a partially filled edge block.
    if (row >= width || col >= width)
        return;

    // Dot product of row `row` of M with column `col` of N.
    int accu = 0;
    for (int k = 0; k < width; k++)
    {
        accu += M[row * width + k] * N[k * width + col];
    }

    P[row * width + col] = accu;
}

/* Exit with a diagnostic if a CUDA runtime call reported an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Multiply two size x size int matrices (all 1s times all 2s) on the host and
 * on the GPU, print the elapsed time of each, and compare the two results.
 *
 * size: matrix edge length; must be a positive multiple of 16 because the
 *       kernel is launched with 16x16 thread blocks and size/16 blocks per
 *       dimension. Other values are rejected with a message on stderr.
 *
 * Prints "Success ! " when both results agree. On a mismatch, a failed
 * allocation or a CUDA error the process exits with status 1.
 */
void timeCheck(int size)
{
    const int blockEdge = 16;
    clock_t start, end;
    double result;
    int i, j, k;
    int *h_A, *h_B, *h_C, *h_gC;
    int *d_A, *d_B, *d_C;
    int mismatch = 0;

    // The launch below uses size / blockEdge blocks per dimension, so a size
    // that is not a multiple of blockEdge would leave trailing rows/columns
    // of the result uncomputed.
    if (size <= 0 || size % blockEdge != 0) {
        fprintf(stderr, "timeCheck: size must be a positive multiple of %d (got %d)\n",
                blockEdge, size);
        return;
    }

    // Do the size arithmetic in size_t so size*size*sizeof(int) cannot
    // overflow a signed int.
    const size_t elems = (size_t)size * (size_t)size;
    const size_t sizeByte = sizeof(int) * elems;

    h_A = (int *) malloc(sizeByte);
    h_B = (int *) malloc(sizeByte);
    h_C = (int *) malloc(sizeByte);
    h_gC = (int *) malloc(sizeByte);
    if (h_A == NULL || h_B == NULL || h_C == NULL || h_gC == NULL) {
        fprintf(stderr, "timeCheck: host allocation of %zu bytes failed\n", sizeByte);
        free(h_A);
        free(h_B);
        free(h_C);
        free(h_gC);
        exit(1);
    }

    for (size_t n = 0; n < elems; n++) h_A[n] = 1;
    for (size_t n = 0; n < elems; n++) h_B[n] = 2;

    // Host reference: straightforward triple loop.
    printf("Host Computing Statrs !\n");
    start = clock();
    for(i = 0; i < size; i++)
        for(j = 0; j < size; j++) {
            const size_t out = (size_t)i * size + j;
            h_C[out] = 0;
            for(k = 0; k < size; k++)
                h_C[out] += h_A[(size_t)i * size + k] * h_B[(size_t)k * size + j];
        }
    end = clock();
    result = (double)(end - start)/CLOCKS_PER_SEC;
    printf("CPU Time : %lf\n",result);
    printf("Host Computing Finished !\n");

    checkCuda(cudaMalloc(&d_A, sizeByte), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, sizeByte), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, sizeByte), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice), "copy B to device");

    printf("GPU Computing Statrs !\n");
    start = clock();
    dim3 blocks(size/blockEdge, size/blockEdge); // size = matrix size ( 1024 / 2048 )
    dim3 threads(blockEdge, blockEdge);
    MatrixMul<<<blocks, threads >>>(d_A, d_B, d_C, size);
    // A bad launch configuration is only reported through cudaGetLastError;
    // faults during execution surface at the synchronizing call.
    checkCuda(cudaGetLastError(), "MatrixMul launch");
    // The launch is asynchronous; wait for completion before stopping the clock.
    checkCuda(cudaDeviceSynchronize(), "MatrixMul execution");
    end = clock();
    result = (double)(end - start)/CLOCKS_PER_SEC;
    printf("GPU Computing Finished !\n");
    printf("GPU Time : %lf\n",result);
    checkCuda(cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost), "copy C to host");

    // Compare the GPU result against the host reference; stop at the first
    // difference.
    for (size_t n = 0; n < elems; n++) {
        if (h_C[n] != h_gC[n]) {
            mismatch = 1;
            break;
        }
    }

    if (mismatch)
        printf("Error !\n");
    else
        printf("Success ! \n");

    // Single cleanup path for both outcomes.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_gC);

    if (mismatch)
        exit(1);
}

/*
 * Entry point: runs the CPU-vs-GPU comparison for each configured matrix
 * size, printing a banner before each run and a blank gap between runs.
 */
int main()
{
    // One entry per benchmark run: the banner to print and the matrix edge.
    static const struct {
        const char *banner;
        int edge;
    } runs[] = {
        { "1024X1024 matmul\n", 1024 },
        { "2048X2048 matmul\n", 2048 },
    };
    const int runCount = (int)(sizeof(runs) / sizeof(runs[0]));

    for (int r = 0; r < runCount; ++r) {
        // Separate consecutive runs with two newlines.
        if (r != 0)
            printf("\n\n");
        printf("%s", runs[r].banner);
        timeCheck(runs[r].edge);
    }
    return 0;
}


Overwriting cudabasic.cu


In [0]:
!ls

cudabasic  cudabasic.cu  sample_data


In [0]:
!nvcc -o cudabasic cudabasic.cu

In [0]:
!ls

cudabasic  cudabasic.cu  sample_data


In [0]:
!./cudabasic

1024X1024 matmul
Host Computing Statrs !
CPU Time : 11.484223
Host Computing Finished !
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.031977
Success ! 


2048X2048 matmul
Host Computing Statrs !
CPU Time : 106.888172
Host Computing Finished !
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.252153
Success ! 


In [59]:
%%writefile gpuMatmul.cu
#include <stdio.h>
#include <time.h>

// Number of times timeCheck has computed the host-side reference product;
// timeCheck only runs the CPU multiplication while this is still 0.
int count = 0;
// Host buffer for the CPU reference result that timeCheck compares the GPU
// output against for 1024x1024 runs.
int *h_C;

/*
 * Naive integer matrix multiply: P = M * N.
 *
 * All three matrices are square (width x width) and indexed row-major as
 * [row * width + col]. Each thread computes exactly one element of P.
 *
 * Launch layout: a 2D grid of 2D blocks. The y dimension selects the row and
 * the x dimension selects the column. Threads whose (row, col) lies outside
 * the matrix return without touching memory, so the grid is allowed to
 * overshoot the matrix when width is not a multiple of the block size.
 *
 * No shared memory is used.
 */
__global__ void MatrixMul(int *M, int *N, int *P, int width)
{
    // blockIdx  = index of this CUDA block within the grid
    // threadIdx = index of this CUDA thread within its block
    // blockDim  = size of a CUDA block
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads in a partially filled edge block.
    if (row >= width || col >= width)
        return;

    // Dot product of row `row` of M with column `col` of N.
    int accu = 0;
    for (int k = 0; k < width; k++)
    {
        accu += M[row * width + k] * N[k * width + col];
    }

    P[row * width + col] = accu; // P[index] = (M * N)[index]
}



/* Exit with a diagnostic if a CUDA runtime call reported an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Time the GPU multiplication of two size x size int matrices (all 1s times
 * all 2s) using threadSize x threadSize thread blocks.
 *
 * size:       matrix edge length; must be a positive multiple of threadSize
 *             because the grid is size/threadSize blocks per dimension.
 * threadSize: edge length of the square thread block.
 *
 * The CPU reference product is computed only once, on the first call with
 * size == 1024 (tracked by the global `count`), and is kept in the global
 * `h_C` so that later 1024 runs can be verified against it. Runs with any
 * other size are timed but not verified. `h_C` is deliberately neither
 * reallocated nor freed here: it has to outlive this call.
 *
 * On a mismatch, a failed allocation or a CUDA error the process exits with
 * status 1.
 */
void timeCheck(int size,int threadSize)
{
    clock_t start, end;
    double result;
    int i, j, k;
    int *h_A, *h_B, *h_gC;
    int *d_A, *d_B, *d_C;
    int mismatch = 0;

    // Reject configurations that would divide by zero or leave trailing
    // rows/columns of the result uncomputed by the truncated grid.
    if (size <= 0 || threadSize <= 0 || size % threadSize != 0) {
        fprintf(stderr,
                "timeCheck: size (%d) must be a positive multiple of threadSize (%d)\n",
                size, threadSize);
        return;
    }

    // Do the size arithmetic in size_t so size*size*sizeof(int) cannot
    // overflow a signed int.
    const size_t elems = (size_t)size * (size_t)size;
    const size_t sizeByte = sizeof(int) * elems;

    h_A = (int *) malloc(sizeByte);
    h_B = (int *) malloc(sizeByte);
    h_gC = (int *) malloc(sizeByte);
    if (h_A == NULL || h_B == NULL || h_gC == NULL) {
        fprintf(stderr, "timeCheck: host allocation of %zu bytes failed\n", sizeByte);
        free(h_A);
        free(h_B);
        free(h_gC);
        exit(1);
    }

    for (size_t n = 0; n < elems; n++) h_A[n] = 1;
    for (size_t n = 0; n < elems; n++) h_B[n] = 2;

    if(size == 1024 && count == 0){
        // Use the buffer already published through h_C; allocate one only if
        // nobody has done so yet.
        if (h_C == NULL) {
            h_C = (int *) malloc(sizeByte);
            if (h_C == NULL) {
                fprintf(stderr, "timeCheck: reference buffer allocation failed\n");
                free(h_A);
                free(h_B);
                free(h_gC);
                exit(1);
            }
        }
        printf("Host Computing Statrs !\n");
        start = clock();
        for(i = 0; i < size; i++)
            for(j = 0; j < size; j++) {
                const size_t out = (size_t)i * size + j;
                h_C[out] = 0;
                for(k = 0; k < size; k++)
                    h_C[out] += h_A[(size_t)i * size + k] * h_B[(size_t)k * size + j];
            }
        end = clock();
        result = (double)(end - start)/CLOCKS_PER_SEC;
        printf("CPU Time : %lf\n",result);
        printf("Host Computing Finished !\n");
        count++;
    }

    checkCuda(cudaMalloc(&d_A, sizeByte), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, sizeByte), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, sizeByte), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice), "copy B to device");

    printf("GPU Computing Statrs !\n");
    start = clock();
    dim3 blocks(size/threadSize, size/threadSize); // size = matrix size ( 1024 / 2048 )
    dim3 threads(threadSize, threadSize);
    MatrixMul<<<blocks, threads >>>(d_A, d_B, d_C, size);
    // A bad launch configuration (e.g. too many threads per block) is only
    // reported through cudaGetLastError; execution faults surface at the sync.
    checkCuda(cudaGetLastError(), "MatrixMul launch");
    // The launch is asynchronous; wait for completion before stopping the clock.
    checkCuda(cudaDeviceSynchronize(), "MatrixMul execution");
    end = clock();
    result = (double)(end - start)/CLOCKS_PER_SEC;
    printf("GPU Computing Finished !\n");
    printf("GPU Time : %lf\n",result);
    checkCuda(cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost), "copy C to host");

    // Only 1024x1024 runs have a CPU reference to compare against.
    if(size == 1024)
    {
        for (size_t n = 0; n < elems; n++) {
            if (h_C[n] != h_gC[n]) {
                mismatch = 1;
                break;
            }
        }
    }

    if (mismatch)
        printf("Error !\n");
    else
        printf("Success ! \n");

    // Single cleanup path for both outcomes. h_C is intentionally kept.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_gC);

    if (mismatch)
        exit(1);
}

/*
 * Entry point: for every matrix size, runs timeCheck once per thread-block
 * edge length, printing a banner line before each run.
 */
int main()
{
    // Matrix edge lengths and thread-block edge lengths to benchmark, in the
    // order they are run.
    static const int matrixEdges[] = { 1024, 2048, 4096 };
    static const int blockEdges[]  = { 16, 32 };
    const int edgeCount  = (int)(sizeof(matrixEdges) / sizeof(matrixEdges[0]));
    const int blockCount = (int)(sizeof(blockEdges) / sizeof(blockEdges[0]));

    // Room for one 1024 x 1024 int matrix, stored in the global h_C.
    h_C = (int *) malloc(sizeof(int)*1024*1024);

    for (int s = 0; s < edgeCount; ++s) {
        const int edge = matrixEdges[s];
        for (int b = 0; b < blockCount; ++b) {
            const int blockEdge = blockEdges[b];
            printf("%dX%d matmul threads = %d\n", edge, edge, blockEdge);
            timeCheck(edge, blockEdge);
        }
    }
    return 0;
}


Overwriting gpuMatmul.cu


In [0]:
!nvcc -o gpuMatmul gpuMatmul.cu

In [61]:
!ls

cudabasic  cudabasic.cu  gpuMatmul  gpuMatmul.cu  sample_data


In [62]:
!./gpuMatmul

1024X1024 matmul threads = 16
Host Computing Statrs !
CPU Time : 11.283803
Host Computing Finished !
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.032183
Success ! 
1024X1024 matmul threads = 32
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.023892
Success ! 
2048X2048 matmul threads = 16
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.239794
Success ! 
2048X2048 matmul threads = 32
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.154283
Success ! 
4096X4096 matmul threads = 16
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 1.385778
Success ! 
4096X4096 matmul threads = 32
GPU Computing Statrs !
GPU Computing Finished !
GPU Time : 0.990057
Success ! 
