In [59]:
%%writefile exmem_vecadd.cu
#include <cuda_runtime.h>
#include <memory.h>
#include <cstdlib>
#include <ctime>
#include <stdio.h>
#include <cmath>
#include <iostream>
#include <chrono>

__global__ void vecADD(float* A, float* B, float* C, int vectorLength){
  int workIndex=threadIdx.x+blockIdx.x*blockDim.x;
  if (workIndex<vectorLength){
    C[workIndex]=A[workIndex]+B[workIndex];
  }
}

void initArray(float* A, int length){
  for(int i=0; i<length; i++){
    A[i]=std::rand()/(float)RAND_MAX;
  }
}

void serialVecAdd(float* A, float* B, float* C, int length){
for(int i=0; i<length; i++){
C[i] = A[i] + B[i];
}
}

bool vectorApproximatelyEqual(float* A, float* B, int length, float epsilon=1e-7){
for(int i=0; i<length; i++){
if(fabs(A[i]-B[i]) > epsilon){
printf("Index %d mismatch: %f != %f", i, A[i], B[i]);
return false;
}
}
return true;
}



void explicitMem(int vectorLength, int threads){
  float* A, *B, *C; //pointers for host memory
  float* devA, *devB, *devC; //device memory
  cudaMallocHost(&A, vectorLength*sizeof(float));
  cudaMallocHost(&B, vectorLength*sizeof(float));
  cudaMallocHost(&C, vectorLength*sizeof(float));

  initArray(A, vectorLength);
  initArray(B, vectorLength);

  cudaMalloc(&devA, vectorLength*sizeof(float));
  cudaMalloc(&devB, vectorLength*sizeof(float));
  cudaMalloc(&devC, vectorLength*sizeof(float));

  cudaMemcpy(devA, A, vectorLength*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(devB, B, vectorLength*sizeof(float), cudaMemcpyDefault);
  cudaMemset(devC, 0, vectorLength*sizeof(float));

  int blocks = (vectorLength + threads-1)/threads;
  auto start = std::chrono::high_resolution_clock::now();
  vecADD<<<blocks, threads>>>(devA, devB, devC, vectorLength);
  cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
}

cudaDeviceSynchronize();
  cudaMemcpy(C, devC, vectorLength*sizeof(float), cudaMemcpyDefault);
  auto end = std::chrono::high_resolution_clock::now();
  double time_ms =std::chrono::duration<double, std::milli>(end - start).count();

 float* comparisonResult = new float[vectorLength]();
 serialVecAdd(A, B, comparisonResult, vectorLength);

  if(vectorApproximatelyEqual(C, comparisonResult, vectorLength)){
printf("Explicit Memory: CPU and GPU answers match\n");
}
else{
printf("Explicit Memory: Error- CPU and GPU answers to not match\n");
}

  std::cout << "Time: " << time_ms << " ms\n";
  std::cout <<"Total number of threads" <<blocks*threads <<"\n";
  cudaFreeHost(A);
  cudaFreeHost(B);
  cudaFreeHost(C);
  cudaFree(devA);
  cudaFree(devB);
  cudaFree(devC);
  delete[] comparisonResult;
}




int main(int argc, char** argv){
  int vectorLength=1000;
  std::srand(std::time(nullptr));   //
  if(argc>=2){
    vectorLength=std::atoi(argv[1]);
  }
  int threads[4]={32, 128, 256, 512};
  for (int i=0; i<4; i++){
    explicitMem(vectorLength, threads[i]);
  }
  return 0;
}


Overwriting exmem_vecadd.cu


In [63]:
!nvcc exmem_vecadd.cu \
  -arch=compute_75 \
  -code=sm_75 \
  --cubin \
  -o vecadd
!nvcc exmem_vecadd.cu -o main
!./main 1000

Kernel launch failed: the provided PTX was compiled with an unsupported toolchain.
Index 0 mismatch: 0.000000 != 0.507612Explicit Memory: Error- CPU and GPU answers to not match
Time: 8.00178 ms
Total number of threads1024
Kernel launch failed: the provided PTX was compiled with an unsupported toolchain.
Index 0 mismatch: 0.000000 != 1.115201Explicit Memory: Error- CPU and GPU answers to not match
Time: 0.033612 ms
Total number of threads1024
Kernel launch failed: the provided PTX was compiled with an unsupported toolchain.
Index 0 mismatch: 0.000000 != 1.673212Explicit Memory: Error- CPU and GPU answers to not match
Time: 0.043248 ms
Total number of threads1024
Kernel launch failed: the provided PTX was compiled with an unsupported toolchain.
Index 0 mismatch: 0.000000 != 1.130162Explicit Memory: Error- CPU and GPU answers to not match
Time: 0.029468 ms
Total number of threads1024


Tue Dec 23 05:55:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [20]:
import numpy as np
import time

a=np.random.rand(1000)
b=np.random.rand(1000)
start = time.perf_counter()
c=a+b
end = time.perf_counter()
time_ms = (end - start) * 1000
print(f"Time: {time_ms:.3f} ms")

Time: 0.112 ms
