# CUDA Setup

In [1]:
!nvidia-smi

Tue Nov  5 17:55:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [4]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [5]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpvc9blf4i".



## Find the transpose of a matrix using shared memory on the GPU

In [6]:
%%writefile qtranpose.cu
#include <iostream>
#include <cuda.h>

#define TILE_DIM 32

// Transposes a row-major matrix through a shared-memory tile.
//
//   input  : `height` rows of `width` floats (row stride = width)
//   output : `width` rows of `height` floats (row stride = height)
//
// Each block handles one TILE_DIM x TILE_DIM tile and indexes the tile with
// threadIdx.x / threadIdx.y, so it must be launched with TILE_DIM x TILE_DIM
// threads per block. Threads that fall outside the matrix neither load nor store.
__global__ void matrixTransposeShared(float *input, float *output, int width, int height) {
    // One extra column per row: standard padding so that the column-wise read
    // further down does not make neighbouring threads hit the same shared-memory bank.
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    // Origin of this block's tile in the input matrix.
    const unsigned int tileColStart = blockIdx.x * TILE_DIM;
    const unsigned int tileRowStart = blockIdx.y * TILE_DIM;

    // Stage 1: copy the input element owned by this thread into the tile.
    const int srcCol = tileColStart + threadIdx.x;
    const int srcRow = tileRowStart + threadIdx.y;
    const bool insideInput = (srcCol < width) && (srcRow < height);
    if (insideInput) {
        tile[threadIdx.y][threadIdx.x] = input[srcRow * width + srcCol];
    }

    // Every thread of the block must reach this barrier before any tile read.
    __syncthreads();

    // Stage 2: the tile lands at the mirrored block position in the output,
    // so the block indices swap roles.
    const int dstCol = tileRowStart + threadIdx.x;
    const int dstRow = tileColStart + threadIdx.y;
    if (dstCol >= height || dstRow >= width) {
        return;
    }

    // Swapped tile indices perform the actual transpose.
    output[dstRow * height + dstCol] = tile[threadIdx.x][threadIdx.y];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Fills a width x height matrix with 0, 1, 2, ..., transposes it on the GPU and
// prints the top-left corner (at most 5 x 5) of the transposed result.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
    const int width = 1024;
    const int height = 1024;
    // size_t keeps the element and byte counts from overflowing int for larger matrices.
    const size_t count = static_cast<size_t>(width) * height;
    const size_t size = count * sizeof(float);

    float *h_input = new float[count];
    float *h_output = new float[count];
    for (size_t i = 0; i < count; ++i) {
        h_input[i] = static_cast<float>(i);
    }

    // Null-initialised so the cudaFree calls below are safe even if an allocation failed.
    float *d_input = nullptr;
    float *d_output = nullptr;

    // Short-circuit evaluation: the first failing step stops the chain.
    bool ok = checkCuda(cudaMalloc((void**)&d_input, size), "cudaMalloc d_input")
           && checkCuda(cudaMalloc((void**)&d_output, size), "cudaMalloc d_output")
           && checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy host to device");

    if (ok) {
        dim3 blockSize(TILE_DIM, TILE_DIM);
        dim3 gridSize((width + TILE_DIM - 1) / TILE_DIM, (height + TILE_DIM - 1) / TILE_DIM);
        matrixTransposeShared<<<gridSize, blockSize>>>(d_input, d_output, width, height);

        // cudaGetLastError reports launch-configuration failures; errors raised while
        // the kernel runs are returned by the blocking copy that follows.
        ok = checkCuda(cudaGetLastError(), "matrixTransposeShared launch")
          && checkCuda(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost),
                       "cudaMemcpy device to host");
    }

    if (ok) {
        // The transposed matrix has `width` rows and `height` columns, so its row
        // stride is `height` (not `width`). Clamp the preview for small matrices.
        const int previewRows = (width < 5) ? width : 5;
        const int previewCols = (height < 5) ? height : 5;

        std::cout << "Output matrix (transposed): " << std::endl;
        for (int i = 0; i < previewRows; ++i) {
            for (int j = 0; j < previewCols; ++j) {
                std::cout << h_output[static_cast<size_t>(i) * height + j] << " ";
            }
            std::cout << std::endl;
        }
    }

    cudaFree(d_input);
    cudaFree(d_output);
    delete[] h_input;
    delete[] h_output;

    return ok ? 0 : 1;
}


Writing qtranpose.cu


In [7]:
!nvcc qtranpose.cu -o qtranpose

In [8]:
!./qtranpose

Output matrix (transposed): 
0 1024 2048 3072 4096 
1 1025 2049 3073 4097 
2 1026 2050 3074 4098 
3 1027 2051 3075 4099 
4 1028 2052 3076 4100 


## Write a CUDA C kernel that computes the sum of an array using shared memory.

In [9]:
%%writefile qarraysum.cu
#include <iostream>
#include <cuda.h>

// Adds the sum of arr[0..n) into *result using a per-block shared-memory reduction.
//
//   arr    : device array of at least n ints
//   result : device int that is accumulated into with atomicAdd, so the caller
//            must initialise it (normally to 0) before the launch
//   n      : number of valid elements in arr
//
// Launch with a 1-D grid/block and blockDim.x * sizeof(int) bytes of dynamic shared
// memory. Each thread loads at most one element, so the grid must cover n.
// Any block size works, including sizes that are not a power of two.
__global__ void arraySumShared(int *arr, int *result, int n) {
    extern __shared__ int sharedData[];

    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array contribute 0 so the reduction stays uniform.
    sharedData[tid] = (index < n) ? arr[index] : 0;
    __syncthreads();

    // Tree reduction over the first `active` slots. The upper part is folded onto the
    // lower part with a rounded-up half, so an odd `active` keeps its middle element
    // instead of dropping it. For power-of-two block sizes this is the usual
    // stride = blockDim.x / 2, / 4, ... sequence.
    unsigned int active = blockDim.x;
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;
        // Writers touch slots below active - half, readers touch slots at or above
        // half, so no slot is read and written within the same step.
        if (tid + half < active) {
            sharedData[tid] += sharedData[tid + half];
        }
        // The loop condition is identical for all threads, so the barrier is safe here.
        __syncthreads();
        active = half;
    }

    // One atomic per block merges the block's partial sum into the global total.
    if (tid == 0) {
        atomicAdd(result, sharedData[0]);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Sums an array of n ones on the GPU and prints the total.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
    const int n = 1024;
    int *h_arr = new int[n];
    for (int i = 0; i < n; ++i) {
        h_arr[i] = 1;
    }

    // A single int does not need a heap allocation; it also seeds the device accumulator with 0.
    int h_result = 0;

    // Null-initialised so the cudaFree calls below are safe even if an allocation failed.
    int *d_arr = nullptr;
    int *d_result = nullptr;

    // Short-circuit evaluation: the first failing step stops the chain.
    bool ok = checkCuda(cudaMalloc((void**)&d_arr, n * sizeof(int)), "cudaMalloc d_arr")
           && checkCuda(cudaMalloc((void**)&d_result, sizeof(int)), "cudaMalloc d_result")
           && checkCuda(cudaMemcpy(d_arr, h_arr, n * sizeof(int), cudaMemcpyHostToDevice),
                        "cudaMemcpy array to device")
           && checkCuda(cudaMemcpy(d_result, &h_result, sizeof(int), cudaMemcpyHostToDevice),
                        "cudaMemcpy result to device");

    if (ok) {
        const int blockSize = 256;
        const int gridSize = (n + blockSize - 1) / blockSize;
        // Third launch parameter: one int of dynamic shared memory per thread.
        arraySumShared<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_arr, d_result, n);

        // cudaGetLastError reports launch-configuration failures; errors raised while
        // the kernel runs are returned by the blocking copy that follows.
        ok = checkCuda(cudaGetLastError(), "arraySumShared launch")
          && checkCuda(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy result to host");
    }

    if (ok) {
        std::cout << "Sum of array: " << h_result << std::endl;
    }

    cudaFree(d_arr);
    cudaFree(d_result);
    delete[] h_arr;

    return ok ? 0 : 1;
}


Writing qarraysum.cu


In [10]:
!nvcc qarraysum.cu -o qarraysum

In [11]:
!./qarraysum

Sum of array: 1024
