In [20]:
%%writefile test.cu
#include <iostream>
#include <cuda_runtime.h>
// Element-wise integer vector addition: C[idx] = A[idx] + B[idx].
//
// Each thread derives one global index from the x components of its block and
// thread coordinates and handles at most that single element. Threads whose
// index is n or larger exit without touching memory, so the launch may contain
// more threads than there are elements.
//
// A, B : input vectors, at least n elements each
// C    : output vector, at least n elements
// n    : number of elements to add
__global__ void addVectors(int* A, int* B, int* C, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    C[idx] = A[idx] + B[idx];
}
// Reports a failed CUDA runtime call on stderr as "<what> failed: <reason>".
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Reads exactly count integers from standard input into dst.
// Returns false as soon as one extraction fails (malformed or missing input).
static bool readVector(int* dst, int count) {
    for (int i = 0; i < count; i++) {
        if (!(std::cin >> dst[i])) {
            return false;
        }
    }
    return true;
}

// Computes C = A + B for n > 0 elements on the GPU.
// Allocates the device buffers, copies A and B over, launches addVectors,
// waits for it to finish and copies the result back into C. Every CUDA call
// is checked, and the device buffers are released on every path.
// Returns true only if all steps succeeded.
static bool addOnDevice(const int* A, const int* B, int* C, int n) {
    // Byte count is computed in size_t so it cannot overflow int for large n.
    const size_t bytes = static_cast<size_t>(n) * sizeof(int);

    int* dev_A = nullptr;
    int* dev_B = nullptr;
    int* dev_C = nullptr;

    // Short-circuit evaluation stops at the first failing call.
    bool ok = cudaSucceeded(cudaMalloc(&dev_A, bytes), "cudaMalloc(dev_A)")
           && cudaSucceeded(cudaMalloc(&dev_B, bytes), "cudaMalloc(dev_B)")
           && cudaSucceeded(cudaMalloc(&dev_C, bytes), "cudaMalloc(dev_C)")
           && cudaSucceeded(cudaMemcpy(dev_A, A, bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(A to device)")
           && cudaSucceeded(cudaMemcpy(dev_B, B, bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(B to device)");

    if (ok) {
        const int blockSize = 256;
        // Ceiling division written so that it cannot overflow int (n > 0 here).
        const int numBlocks = (n - 1) / blockSize + 1;

        addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);

        // cudaGetLastError reports launch problems; the synchronize call
        // reports errors raised while the kernel was running.
        ok = cudaSucceeded(cudaGetLastError(), "Kernel launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "Kernel execution")
          && cudaSucceeded(cudaMemcpy(C, dev_C, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(result to host)");
    }

    // cudaFree on a null pointer performs no operation, so this is safe even
    // when an allocation above failed.
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    return ok;
}

// Reads two integer vectors of a user-chosen length from standard input, adds
// them on the GPU and prints up to the first 10 sums.
// Returns 0 on success and 1 on invalid input or any CUDA failure.
int main() {
    int n = 0;
    std::cout << "Enter the number of elements: ";
    if (!(std::cin >> n) || n <= 0) {
        std::cerr << "The number of elements must be a positive integer." << std::endl;
        return 1;
    }

    int* A = new int[n];
    int* B = new int[n];
    int* C = new int[n];
    int exitCode = 1;

    std::cout << "Enter elements for vector A:" << std::endl;
    bool inputOk = readVector(A, n);
    if (inputOk) {
        std::cout << "Enter elements for vector B:" << std::endl;
        inputOk = readVector(B, n);
    }

    if (!inputOk) {
        std::cerr << "Invalid or incomplete vector input." << std::endl;
    } else if (addOnDevice(A, B, C, n)) {
        std::cout << "Vector Addition Results:" << std::endl;
        for (int i = 0; i < n && i < 10; i++) {  // print up to 10 results
            std::cout << C[i] << " ";
        }
        std::cout << std::endl;
        exitCode = 0;
    }

    // Single cleanup point: host buffers are released on every path.
    delete[] A;
    delete[] B;
    delete[] C;
    return exitCode;
}




Overwriting test.cu


In [21]:
!nvcc test.cu -o test


!nvcc -arch=sm_75 -o test test.cu


!./test

Enter the number of elements: 5
Enter elements for vector A:
1 2 3 4 5
Enter elements for vector B:
10 20 30 40 50
Vector Addition Results:
11 22 33 44 55 


In [None]:
This CUDA C++ program performs vector addition in parallel using a GPU. It takes two arrays (A and B) of integers from the user, adds them element-wise on the GPU, and prints the result.

🧠 Section-wise Explanation
1. Includes & Setup
```cpp
#include <iostream>
#include <cuda_runtime.h>
```
iostream: Standard C++ header for input and output.

cuda_runtime.h: Includes CUDA-specific functions like cudaMalloc, cudaMemcpy, etc.

2. The CUDA Kernel Function
```cpp
__global__ void addVectors(int* A, int* B, int* C, int n)
```
__global__: CUDA keyword marking this function as a kernel — callable from host (CPU) and executed on the device (GPU).

int i = blockIdx.x * blockDim.x + threadIdx.x;: This calculates a unique thread index based on:

blockIdx.x: Block number in the grid

blockDim.x: Threads per block

threadIdx.x: Thread number in the block

💡 Concept: Thread Hierarchy

CUDA uses a hierarchy: Grid → Blocks → Threads. Each thread executes the kernel independently.

3. User Input & Memory Allocation
```cpp
int* A = new int[n];
```
Uses dynamic memory allocation (heap) to allow runtime-determined size.

Prompts the user to input elements for vectors A and B.

🧠 Concept: Host vs Device Memory

Host memory: regular CPU RAM (e.g., A, B, C).

Device memory: GPU RAM (e.g., dev_A, dev_B, dev_C).

4. Memory Copy (Host to Device)
```cpp
cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
```
Transfers data from the CPU to GPU.

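Required before computation because the kernel cannot dereference ordinary host pointers such as the arrays allocated with new; the data must first be placed in device memory.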

💡 Concept: cudaMemcpy

cudaMemcpyHostToDevice: Copy from CPU to GPU.

cudaMemcpyDeviceToHost: Copy from GPU to CPU.

5. Kernel Launch
```cpp
addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
```
<<<numBlocks, blockSize>>>: Launch configuration. Specifies how many blocks and threads per block to use.

💡 Formula:

```cpp
numBlocks = (n + blockSize - 1) / blockSize;
```
Ensures all n elements are processed, even if n isn't divisible by blockSize.

6. Error Checking and Synchronization
```cpp
cudaError_t err = cudaGetLastError();
```
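Checks whether the kernel launch itself was accepted (for example, that the grid and block configuration was valid). Errors that occur while the kernel is running are not visible here; they are reported by a later synchronizing call.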

cudaDeviceSynchronize() waits until the GPU finishes execution before moving on.

💡 Concept: Asynchronous Execution

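CUDA kernel launches are asynchronous by default: the launch returns control to the host before the kernel has finished running. Synchronizing before the results are used ensures predictable behavior.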

7. Result Transfer and Cleanup
```cpp
cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);
```
Brings result back to host.

Only prints first 10 results (for brevity).

Frees device and host memory.

📘 Related Computer Science Concepts
Here are some broader concepts this code relates to:

1. Parallel Computing
Each GPU thread handles a single element of the vectors.

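Can drastically improve performance for large n, although for an operation as cheap as addition the time spent copying data between host and device can outweigh the gain.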

2. Memory Management
The code demonstrates manual memory management (both host and device), which is crucial in systems programming and performance optimization.

3. Concurrency vs Parallelism
Concurrency is about dealing with multiple tasks at once (not necessarily simultaneous).

Parallelism is actually doing many tasks simultaneously, like GPU threads here.

4. Race Conditions
This code avoids race conditions by giving each thread its own index i.

In more complex scenarios, care must be taken to prevent multiple threads writing to the same memory.

5. Hardware Abstraction
CUDA abstracts GPU architecture while still exposing thread-level control.

You must understand both software and hardware to optimize performance.