# CUDA Setup

In [1]:
!nvidia-smi

Tue Nov  5 05:56:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [3]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [4]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp2n6tgnvs".


# Kernal II Configuration

## erform matrix-matrix multiplication using shared memory

In [8]:
%%writefile qconvolution.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void convolve1D(float *input, float *output, float *kernel, int inputSize, int kernelSize) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int halfKernel = kernelSize / 2;
    if (idx >= inputSize) return;

    float sum = 0;
    for (int j = -halfKernel; j <= halfKernel; ++j) {
        int index = idx + j;
        if (index >= 0 && index < inputSize)
            sum += input[index] * kernel[halfKernel + j];
    }
    output[idx] = sum;
}

void testConfiguration(int N, int threads, int kernelSize) {
    printf("Running with N = %d, Threads per block = %d\n", N, threads);

    float *h_input = (float*)malloc(N * sizeof(float));
    float *h_output = (float*)malloc(N * sizeof(float));
    float h_kernel[] = {0.2, 0.2, 0.2, 0.2, 0.2};

    for (int i = 0; i < N; ++i) h_input[i] = 1.0f;

    float *d_input, *d_output, *d_kernel;
    cudaMalloc(&d_input, N * sizeof(float));
    cudaMalloc(&d_output, N * sizeof(float));
    cudaMalloc(&d_kernel, kernelSize * sizeof(float));

    cudaMemcpy(d_input, h_input, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice);

    int numBlocks = (N + threads - 1) / threads;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    convolve1D<<<numBlocks, threads>>>(d_input, d_output, d_kernel, N, kernelSize);
    cudaDeviceSynchronize();


    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Time taken for N = %d, Threads = %d: %f ms\n", N, threads, milliseconds);

    cudaMemcpy(h_output, d_output, N * sizeof(float), cudaMemcpyDeviceToHost);


    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_kernel);
    free(h_input);
    free(h_output);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

int main() {
    const int kernelSize = 5;
    int Ns[] = {1024, 2048, 8192}; // Different input sizes
    int threads[] = {512, 1024, 2048}; // Different thread configurations

    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            testConfiguration(Ns[i], threads[j], kernelSize);
        }
    }

    return 0;
}


Writing qconvolution.cu


In [9]:
!nvcc qconvolution.cu -o qconvolution

In [10]:
!./qconvolution

Running with N = 1024, Threads per block = 512
Time taken for N = 1024, Threads = 512: 42.205185 ms
Running with N = 1024, Threads per block = 1024
Time taken for N = 1024, Threads = 1024: 0.018240 ms
Running with N = 1024, Threads per block = 2048
Time taken for N = 1024, Threads = 2048: 0.023072 ms
Running with N = 2048, Threads per block = 512
Time taken for N = 2048, Threads = 512: 0.016128 ms
Running with N = 2048, Threads per block = 1024
Time taken for N = 2048, Threads = 1024: 0.016384 ms
Running with N = 2048, Threads per block = 2048
Time taken for N = 2048, Threads = 2048: 0.009920 ms
Running with N = 8192, Threads per block = 512
Time taken for N = 8192, Threads = 512: 0.016576 ms
Running with N = 8192, Threads per block = 1024
Time taken for N = 8192, Threads = 1024: 0.017376 ms
Running with N = 8192, Threads per block = 2048
Time taken for N = 8192, Threads = 2048: 0.008160 ms
