In [12]:
!nvidia-smi

Thu Dec 25 12:03:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [18]:
%%writefile multiply_scale.cu

#include <iostream>
#include <cuda_runtime_api.h>
#include <memory.h>
#include <cstdlib>
#include <ctime>
#include <stdio.h>
#include <cmath>
#include <chrono>

__global__ void mul_scale(float* A, float* B, float n, float* C, int vectorLength){
  int workIndex=blockIdx.x*blockDim.x+threadIdx.x;
  if(workIndex<vectorLength){
    C[workIndex]=A[workIndex]*B[workIndex]*n;
  }
}


void initArray(float* A, int length){
  std::srand(std::time(0));
  for(int i=0; i<length; i++){
    A[i]=std::rand()/(float)RAND_MAX;
  }
}

void serialMulScale(float* A, float* B, float* C, int length, float n){
for(int i=0; i<length; i++){
C[i] = A[i] * B[i] *n;
}
}

bool vectorApproximatelyEqual(float* A, float* B, int length, float epsilon=1e-5f){
for(int i=0; i<length; i++){
if(fabs(A[i]-B[i]) > epsilon){
printf("Index %d mismatch: %f != %f", i, A[i], B[i]);
return false;
}
}
return true;
}

void memory(int vectorLength, float n, int threads){
  float* A, *B, *C;
  cudaMallocHost(&A, vectorLength*sizeof(float));
  cudaMallocHost(&B, vectorLength*sizeof(float));
  cudaMallocHost(&C, vectorLength*sizeof(float));

  float* devA, *devB, *devC;
  cudaMalloc(&devA, vectorLength*sizeof(float));
  cudaMalloc(&devB, vectorLength*sizeof(float));
  cudaMalloc(&devC, vectorLength*sizeof(float));

  initArray(A, vectorLength);
  initArray(B, vectorLength);

  cudaMemcpy(devA, A, vectorLength*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(devB, B, vectorLength*sizeof(float), cudaMemcpyDefault);
  cudaMemset(devC, 0, vectorLength*sizeof(float));

  int blocks = (vectorLength + threads-1)/threads;
  auto start = std::chrono::high_resolution_clock::now();
  mul_scale<<<blocks, threads>>>(devA, devB, n, devC, vectorLength);
  cudaDeviceSynchronize();
  cudaMemcpy(C, devC, vectorLength*sizeof(float), cudaMemcpyDefault);
  auto end = std::chrono::high_resolution_clock::now();
  double time_ms =std::chrono::duration<double, std::milli>(end - start).count();

  std::cout << "Time: " << time_ms << " ms\n";
  std::cout << blocks*threads <<"\n";

  float* comparisonResult = new float[vectorLength]();
  serialMulScale(A, B, comparisonResult, vectorLength, n);

  std::cout << vectorApproximatelyEqual(comparisonResult, C, vectorLength) << "\n";
  cudaFreeHost(A);
  cudaFreeHost(B);
  cudaFreeHost(C);
  cudaFree(devA);
  cudaFree(devB);
  cudaFree(devC);
}

int main(int argc, char** argv){
  int vectorLength=1024;
  int n=0.1;
  if(argc>2){
    vectorLength=std::atoi(argv[1]);
    n=std::atof(argv[2]);
  }
  int threads[4]={32, 128, 256, 512};
  for (int i=0; i<4; i++){
    memory(vectorLength, n, threads[i]);
  }
  return 0;
}

Overwriting multiply_scale.cu


In [19]:
!nvcc multiply_scale.cu \
  --generate-code arch=compute_75,code=sm_75 \
  -o multscale

In [20]:
!./multscale 1000

Time: 0.113866 ms
1024
1
Time: 0.027627 ms
1024
1
Time: 0.026459 ms
1024
1
Time: 0.026743 ms
1024
1


In [21]:
!./multscale 100000

Time: 0.134554 ms
1024
1
Time: 0.032633 ms
1024
1
Time: 0.02946 ms
1024
1
Time: 0.030632 ms
1024
1


In [22]:
!./multscale 10000000

Time: 0.102294 ms
1024
1
Time: 0.028327 ms
1024
1
Time: 0.027406 ms
1024
1
Time: 0.027839 ms
1024
1


In [25]:
import numpy as np
import time

a=np.random.rand(10000000)
b=np.random.rand(10000000)
n=0.1
start = time.perf_counter()
c=np.multiply(a, b)*n
end = time.perf_counter()
time_ms = (end - start) * 1000
print(f"Time: {time_ms:.3f} ms")

Time: 44.765 ms
