In [13]:
%%writefile relu.cu
#include <cuda_runtime_api.h>
#include <memory.h>
#include <stdio.h>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>

// Element-wise ReLU: B[i] = max(0, A[i]) for every i in [0, vectorLength).
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid: each
// thread starts at its flat global index and advances by the total number of
// launched threads, so the kernel stays correct even when the grid contains
// fewer threads than there are elements.
// A and B must each hold at least vectorLength floats and be accessible from
// the device. They may be the same buffer, since every element is read before
// the same index is written. No shared memory is used.
__global__ void relu(float* A, float*B, int vectorLength){
  if(vectorLength<=0){
    return;  // nothing to process; also keeps the size_t conversion below sane
  }
  const size_t count=static_cast<size_t>(vectorLength);
  // Index math is done in 64 bits. The built-in index variables are unsigned,
  // and narrowing their combination to int could produce a negative index
  // that passes a signed "index < length" check and writes out of bounds.
  const size_t stride=static_cast<size_t>(gridDim.x)*blockDim.x;
  for(size_t workIndex=static_cast<size_t>(blockIdx.x)*blockDim.x+threadIdx.x;
      workIndex<count;
      workIndex+=stride){
    B[workIndex]=fmaxf(0.0f,A[workIndex]);
  }
}

// Fills A[0..length) with pseudo-random floats in [-1, 1].
//
// The range deliberately includes negative values: ReLU is the identity on
// non-negative inputs, so data drawn only from [0, 1] could not distinguish a
// correct ReLU from a plain copy.
// Does nothing when A is null or length is not positive.
void initArray(float* A, int length){
  if(A==nullptr || length<=0){
    return;
  }
  // Seed only on the first call. Reseeding with time(0) on every call makes
  // all calls that happen within the same second produce identical data.
  static bool seeded=false;
  if(!seeded){
    std::srand(static_cast<unsigned int>(std::time(0)));
    seeded=true;
  }
  for(int i=0; i<length; i++){
    const float unit=std::rand()/(float)RAND_MAX;  // in [0, 1]
    A[i]=2.0f*unit-1.0f;                           // mapped to [-1, 1]
  }
}

// CPU reference implementation of ReLU: B[i] = max(0, A[i]) for i in [0, length).
//
// A is the input, B receives the output; both must hold at least `length`
// floats. Does nothing when length is not positive or either pointer is null.
void serial_relu(float* A, float* B, int length){
  if(A==nullptr || B==nullptr){
    return;
  }
  for(int i=0; i<length; i++){
    // std::fmax comes from <cmath>. An unqualified max() is not declared by
    // the standard headers and only resolves through CUDA's own headers.
    B[i]=std::fmax(0.0f, A[i]);
  }
}

// Compares A and B element by element over [0, length).
//
// Returns true when every pair is exactly equal or differs by at most
// `epsilon` in absolute value. On the first pair that does not, prints the
// index and both values to stdout and returns false.
// A NaN on either side is always reported as a mismatch.
bool vectorApproximatelyEqual(float* A, float* B, int length, float epsilon=1e-5f){
  for(int i=0; i<length; i++){
    if(A[i]==B[i]){
      continue;  // exact match; also covers equal infinities (inf - inf is NaN)
    }
    const float delta=std::fabs(A[i]-B[i]);
    // Written as a negated "<=" so that a NaN delta fails the check. With
    // "delta > epsilon" a NaN would compare false and be accepted silently.
    if(!(delta<=epsilon)){
      printf("Index %d mismatch: %f != %f\n", i, A[i], B[i]);
      return false;
    }
  }
  return true;
}

// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what){
  if(status!=cudaSuccess){
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

// Runs the relu kernel once on `vectorLength` random floats using `threads`
// threads per block, then prints three lines to stdout:
//   1. "Time: <ms> ms" - wall-clock time of kernel launch + device sync +
//      device-to-host copy of the result (the copy is inside the timed span),
//   2. the total number of launched threads (blocks * threads),
//   3. 1 if the GPU result matches serial_relu within tolerance, else 0.
//
// vectorLength and threads must both be positive; otherwise a message is
// written to stderr and the function returns without touching the GPU.
// Any failing CUDA call terminates the program via checkCuda.
void memory(int vectorLength, int threads){
  if(vectorLength<=0 || threads<=0){
    fprintf(stderr, "memory: vectorLength and threads must be positive (got %d and %d)\n",
            vectorLength, threads);
    return;
  }
  const size_t bytes=sizeof(float)*static_cast<size_t>(vectorLength);

  // Host buffers are allocated through the CUDA runtime (page-locked memory).
  float* A=nullptr, *B=nullptr;
  checkCuda(cudaMallocHost(&A, bytes), "cudaMallocHost(A)");
  checkCuda(cudaMallocHost(&B, bytes), "cudaMallocHost(B)");

  initArray(A, vectorLength);

  float* devA=nullptr, *devB=nullptr;
  checkCuda(cudaMalloc(&devA, bytes), "cudaMalloc(devA)");
  checkCuda(cudaMalloc(&devB, bytes), "cudaMalloc(devB)");

  checkCuda(cudaMemcpy(devA, A, bytes, cudaMemcpyHostToDevice), "copy A to device");
  checkCuda(cudaMemset(devB, 0, bytes), "cudaMemset(devB)");

  // Ceil-division written so that it cannot overflow int for large lengths.
  const int blocks=vectorLength/threads + (vectorLength%threads!=0 ? 1 : 0);

  auto start=std::chrono::high_resolution_clock::now();
  relu<<<blocks, threads>>>(devA, devB, vectorLength);
  // A launch does not return a status; bad configurations show up here.
  checkCuda(cudaGetLastError(), "relu launch");
  // Errors raised while the kernel runs surface at the next synchronizing call.
  checkCuda(cudaDeviceSynchronize(), "relu execution");
  checkCuda(cudaMemcpy(B, devB, bytes, cudaMemcpyDeviceToHost), "copy B to host");
  auto end=std::chrono::high_resolution_clock::now();

  double time_ms =std::chrono::duration<double, std::milli>(end - start).count();

  std::cout << "Time: " << time_ms << " ms\n";
  std::cout << static_cast<long long>(blocks)*threads <<"\n";

  float* comparisonResult = new float[vectorLength]();
  serial_relu(A, comparisonResult, vectorLength);

  std::cout << vectorApproximatelyEqual(comparisonResult, B, vectorLength) << "\n";

  delete[] comparisonResult;  // was leaked on every call
  checkCuda(cudaFreeHost(A), "cudaFreeHost(A)");
  checkCuda(cudaFreeHost(B), "cudaFreeHost(B)");
  checkCuda(cudaFree(devA), "cudaFree(devA)");
  checkCuda(cudaFree(devB), "cudaFree(devB)");
}

// Entry point. Usage: relu [vectorLength]
//
// vectorLength defaults to 1024 and must be a positive decimal integer that
// fits in int. The benchmark in memory() is run once for each block size in
// {32, 128, 256, 512}. Returns 0 on success and 1 when the argument is invalid.
int main(int argc, char** argv){
  int vectorLength=1024;
  if(argc>1){
    // strtol instead of atoi: atoi cannot report garbage input and has
    // undefined behavior on overflow. A zero or negative length would
    // otherwise flow straight into the allocation sizes.
    char* parseEnd=nullptr;
    errno=0;
    const long requested=std::strtol(argv[1], &parseEnd, 10);
    const bool consumedAll=(parseEnd!=argv[1]) && (*parseEnd=='\0');
    if(!consumedAll || errno==ERANGE || requested<1 || requested>INT_MAX){
      std::cerr << "Usage: " << argv[0]
                << " [vectorLength]  (positive integer, at most " << INT_MAX << ")\n";
      return 1;
    }
    vectorLength=static_cast<int>(requested);
  }
  const int threads[]={32, 128, 256, 512};
  for(int blockSize : threads){
    memory(vectorLength, blockSize);
  }
  return 0;
}


Overwriting relu.cu


In [14]:
!nvcc relu.cu \
  --generate-code arch=compute_75,code=sm_75 \
  -o relu

In [15]:
!./relu 1000

Time: 0.117389 ms
1024
1
Time: 0.027616 ms
1024
1
Time: 0.026067 ms
1024
1
Time: 0.026456 ms
1024
1


In [16]:
!./relu 100000

Time: 0.146804 ms
100000
1
Time: 0.055857 ms
100096
1
Time: 0.056451 ms
100096
1
Time: 0.055909 ms
100352
1


In [17]:
!./relu 10000000

Time: 4.29382 ms
10000000
1
Time: 3.61019 ms
10000000
1
Time: 3.60065 ms
10000128
1
Time: 3.61478 ms
10000384
1


In [None]:
# Times an element-wise ReLU written as a plain Python loop over a NumPy array
# and prints the elapsed wall-clock time in milliseconds.
import numpy as np
import time

# 1024 samples from np.random.rand, i.e. uniform in [0, 1): every input is
# non-negative, so max(0.0, a[i]) always returns the input value itself.
a=np.random.rand(1024)
b=np.zeros(1024)
# Only the loop is timed; array creation above is outside the measured span.
start = time.perf_counter()
for i in range(len(a)):
  b[i]=max(0.0,a[i])
end = time.perf_counter()
# perf_counter() returns seconds; convert to milliseconds for printing.
time_ms = (end - start) * 1000
print(f"Time: {time_ms:.3f} ms")

Time: 0.520 ms
