# CUDA Setup

In [1]:
!nvidia-smi

Tue Nov  5 05:56:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [3]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [4]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp2n6tgnvs".


# Kernal I Configuration

## Find the transpose of the matrix using Shared memory in GPU

In [5]:
%%writefile qtranpose.cu
#include <stdio.h>
#include <ctime>

#define N 1024
#define DIM 32

__global__ void transpose(int *org, int *trans) {
    __shared__ int shared[DIM][DIM];
    int x = blockIdx.x * DIM + threadIdx.x;
    int y = blockIdx.y * DIM + threadIdx.y;
    if (x < N && y < N) {
        shared[threadIdx.y][threadIdx.x] = org[y * N + x];
    }
    __syncthreads();
    if (x < N && y < N) {
        trans[x * N + y] = shared[threadIdx.x][threadIdx.y];
    }
}

void transposeCPU(int *org, int *trans) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            trans[j * N + i] = org[i * N + j];
        }
    }
}

int main() {
    int *h_org, *h_trans;
    int *d_org, *d_trans;

    h_org = (int*)malloc(N * N * sizeof(int));
    h_trans = (int*)malloc(N * N * sizeof(int));
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_org[i * N + j] = i * N + j;
        }
    }
    cudaMalloc((void**)&d_org, N * N * sizeof(int));
    cudaMalloc((void**)&d_trans, N * N * sizeof(int));
    cudaMemcpy(d_org, h_org, N * N * sizeof(int), cudaMemcpyHostToDevice);
    int block_size = 32;
    dim3 blockDim(block_size, block_size);
    dim3 gridDim((N + block_size - 1) / block_size, (N + block_size - 1) / block_size);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);


    transpose<<<gridDim, blockDim>>>(d_org, d_trans);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);


    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("CUDA Time for N=%d, Threads=%d: %.2f ms\n", N, block_size * block_size, milliseconds);


    cudaMemcpy(h_trans, d_trans, N * N * sizeof(int), cudaMemcpyDeviceToHost);


    cudaFree(d_org);
    cudaFree(d_trans);


    clock_t cpu_start = clock();
    transposeCPU(h_org, h_trans);
    clock_t cpu_end = clock();
    double cpu_time = ((double)(cpu_end - cpu_start)) / CLOCKS_PER_SEC * 1000.0;
    printf("CPU Time for N=%d: %.2f ms\n", N, cpu_time);


    printf("Original Matrix (5x5 subset):\n");
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%d ", h_org[i * N + j]);
        }
        printf("\n");
    }

    printf("\nTransposed Matrix (5x5 subset):\n");
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%d ", h_trans[i * N + j]);
        }
        printf("\n");
    }


    free(h_org);
    free(h_trans);

    return 0;
}


Writing qtranpose.cu


In [6]:
!nvcc qtranpose.cu -o qtranpose

In [7]:
!./qtranpose

CUDA Time for N=1024, Threads=1024: 134.83 ms
CPU Time for N=1024: 19.57 ms
Original Matrix (5x5 subset):
0 1 2 3 4 
1024 1025 1026 1027 1028 
2048 2049 2050 2051 2052 
3072 3073 3074 3075 3076 
4096 4097 4098 4099 4100 

Transposed Matrix (5x5 subset):
0 1024 2048 3072 4096 
1 1025 2049 3073 4097 
2 1026 2050 3074 4098 
3 1027 2051 3075 4099 
4 1028 2052 3076 4100 
