<a href="https://colab.research.google.com/github/devgokulbv/CUDA/blob/main/CUDA_in_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install CUDA C++ plugin for Colab:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpot9bqey1".


In [2]:
# Detect the selected GPU and its NVIDIA architecture.
# Defines gpu_name, compute_cap and gpu_arch (e.g. "sm_75") for the cells below.
import subprocess

# nvidia-smi prints one "name, compute_cap" CSV line per visible GPU.
status, gpu_info = subprocess.getstatusoutput("nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader,nounits")
gpu_lines = [line.strip() for line in gpu_info.splitlines() if line.strip()]

# A non-zero exit status covers a missing nvidia-smi binary as well as a driver
# that cannot reach any GPU; the comma check rejects any non-CSV message.
if status != 0 or not gpu_lines or "," not in gpu_lines[0]:
    raise RuntimeError("Error: No GPU found. Please select a GPU runtime environment.")

# Use the first GPU reported. Split on the last comma only, so a device name
# that itself contains a comma is kept intact.
gpu_name, compute_cap = map(str.strip, gpu_lines[0].rsplit(",", 1))
gpu_arch = f"sm_{compute_cap.replace('.', '')}"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

GPU Name       : Tesla T4
Architecture   : sm_75


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

// Prints one greeting per thread, identifying the thread by its block index,
// its index within the block, and its global index across the launch.
__global__ void hello_kernel() {
    const int block = blockIdx.x;
    const int local = threadIdx.x;

    // Global index = threads in all preceding blocks + offset inside this block.
    const int global = block * blockDim.x + local;

    printf("Hello from block %d, thread %d (global thread %d)\n", block, local, global);
}

int main() {
    int numBlocks = 2;
    int threadsPerBlock = 4;

    hello_kernel<<<numBlocks, threadsPerBlock>>>();
    cudaDeviceSynchronize();

    return 0;
}

Hello from block 0, thread 0 (global thread 0)
Hello from block 0, thread 1 (global thread 1)
Hello from block 0, thread 2 (global thread 2)
Hello from block 0, thread 3 (global thread 3)
Hello from block 1, thread 0 (global thread 4)
Hello from block 1, thread 1 (global thread 5)
Hello from block 1, thread 2 (global thread 6)
Hello from block 1, thread 3 (global thread 7)



In [8]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Matrix dimensions. main() declares its host matrices as row x col and sizes
// the device buffers as row * col floats.
#define row 3
#define col 3

// Element-wise sum of two float arrays: c[i] = a[i] + b[i] for every i < n.
// Each thread handles the single element at its global index; threads whose
// index is n or greater do nothing.
__global__ void kernel(float *c,float *a, float *b, int n)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n)
    {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
int main()
{
    float a[row][col], b[row][col], c[row][col];
    float *d_a, *d_b, *d_c;
    int size = row * col * sizeof(float);
    int num = row * col;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            a[i][j] = rand() % 100;
            b[i][j] = rand() % 100;
            printf("%f ", a[i][j]);

        }
        printf("\n");
    }
    printf("\n");
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {

            printf("%f ", b[i][j]);
        }
        printf("\n");
    }
    printf("\n");
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    kernel<<<3, 3>>>(d_c, d_a, d_b, num);
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%f ", c[i][j]);
        }
        printf("\n");
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}


83.000000 77.000000 93.000000 
86.000000 49.000000 62.000000 
90.000000 63.000000 40.000000 

86.000000 15.000000 35.000000 
92.000000 21.000000 27.000000 
59.000000 26.000000 26.000000 

169.000000 92.000000 128.000000 
178.000000 70.000000 89.000000 
149.000000 89.000000 66.000000 

