In [16]:
import math
import numpy as np
import cupy
from cupyx.profiler import benchmark

# def histogram(input_array, output_array):
#     for item in input_array:
#         output_array[item] = output_array[item] + 1

def np_histogram(input_array, bins):
    """Return the per-bin counts of ``input_array`` computed on the CPU.

    Thin wrapper around ``np.histogram`` that keeps only the first element
    of its result (the counts) and discards the second (the bin edges).

    Args:
        input_array: Data forwarded unchanged to ``np.histogram``.
        bins: Forwarded as the ``bins`` keyword of ``np.histogram``.

    Returns:
        The array of counts produced by ``np.histogram``.
    """
    return np.histogram(input_array, bins=bins)[0]

def cp_histogram(input_array, bins):
    """Return the per-bin counts of ``input_array`` computed by CuPy.

    Thin wrapper around ``cupy.histogram`` that keeps only the first element
    of its result (the counts) and discards the second one.

    Args:
        input_array: Data forwarded unchanged to ``cupy.histogram``.
        bins: Forwarded as the ``bins`` keyword of ``cupy.histogram``.

    Returns:
        The counts array produced by ``cupy.histogram``.
    """
    return cupy.histogram(input_array, bins=bins)[0]

# Number of input samples (2**25).
size = 1 << 25

# Device-side buffers: the random int32 samples and a zero-initialised
# 256-entry int32 histogram. randint(256, ...) is expected to yield values
# in [0, 256), i.e. one per histogram bin -- see the CuPy docs to confirm.
input_gpu = cupy.random.randint(256, size=size, dtype=cupy.int32)
output_gpu = cupy.zeros(256, dtype=cupy.int32)

# Host-side copies of both buffers, so the CPU code works on the same data.
input_cpu = input_gpu.get()
output_cpu = output_gpu.get()

# CUDA source of the histogram kernel.
#
# Each thread block builds a private 256-bin histogram in shared memory and
# then adds it to the global histogram with atomic operations.
#
# Launch requirements that the kernel itself cannot check (it receives no
# length argument):
#   * gridDim.x * blockDim.x must equal the number of elements in `input`,
#     because every thread reads input[item] unconditionally;
#   * every input value must be a valid bin index in [0, 256).
# The bin loops are strided by blockDim.x, so any block size zeroes and
# flushes all 256 bins (not only a block size of exactly 256).
histogram_cuda_code = r'''
#define HISTOGRAM_BINS 256

extern "C"
__global__ void histogram(const int * input, int * output)
{
    int item = (blockIdx.x * blockDim.x) + threadIdx.x;
    __shared__ int temp_histogram[HISTOGRAM_BINS];

    // Zero the per-block histogram; the stride covers every bin even when
    // the block has fewer (or more) threads than there are bins.
    for (int bin = threadIdx.x; bin < HISTOGRAM_BINS; bin += blockDim.x)
    {
        temp_histogram[bin] = 0;
    }
    __syncthreads();

    // Count this thread's element in the shared-memory histogram.
    atomicAdd(&(temp_histogram[input[item]]), 1);
    __syncthreads();

    // Add the per-block counts to the global histogram.
    for (int bin = threadIdx.x; bin < HISTOGRAM_BINS; bin += blockDim.x)
    {
        atomicAdd(&(output[bin]), temp_histogram[bin]);
    }
}
'''

# Launch configuration: 1-D blocks of 256 threads and enough blocks to cover
# all `size` elements (rounded up).
threads_per_block = 256
block_size = (threads_per_block, 1, 1)
grid_size = (math.ceil(size / threads_per_block), 1, 1)

# Wrap the CUDA source in a callable kernel object bound to the "histogram"
# entry point.
histogram_gpu = cupy.RawKernel(histogram_cuda_code, "histogram")

# check correctness
# The NumPy histogram of the host copy is the reference; the hand-written
# kernel and cupy.histogram are both compared against it.
hist = np_histogram(input_cpu, 256)
histogram_gpu(grid_size, block_size, (input_gpu, output_gpu))
cp_hist = cp_histogram(input_gpu, 256)

# Copy both GPU results to the host explicitly so each comparison is a plain
# NumPy-to-NumPy check.
raw_kernel_ok = np.allclose(hist, cupy.asnumpy(output_gpu))
cupy_hist_ok = np.allclose(hist, cupy.asnumpy(cp_hist))

# Report failure when EITHER GPU result disagrees with the reference, so a
# mismatch in only the cupy.histogram result is never silently ignored.
if raw_kernel_ok and cupy_hist_ok:
    print("Correct results!")
else:
    print("Wrong results!")

# measure performance
# (The naive pure-Python loop is not timed here.)

# CPU baseline: NumPy histogram on the host copy, a single repetition.
numpy_hist = benchmark(np_histogram, (input_cpu, 256), n_repeat=1)
print(f"\nNumpy average time: {numpy_hist.to_str()}")

# GPU variants, ten repetitions each. Every launch of the raw kernel adds
# into output_gpu again, so its contents are not a valid histogram afterwards.
kernel_args = (grid_size, block_size, (input_gpu, output_gpu))
execution_gpu = benchmark(histogram_gpu, kernel_args, n_repeat=10)
execution_cupy_hist = benchmark(cp_histogram, (input_gpu, 256), n_repeat=10)
print(f"\nGPU's raw kernel average time: {execution_gpu}")
print(f"GPU's cupy hist average time: {execution_cupy_hist}")

# Mean host-side and device-side times over all repetitions; the values are
# multiplied by 1e3 below to print them in milliseconds.
rawK_avg = np.mean(execution_gpu.cpu_times)
cupy_avg = np.mean(execution_cupy_hist.cpu_times)
rawK_avg_gpu = np.mean(execution_gpu.gpu_times)
cupy_avg_gpu = np.mean(execution_cupy_hist.gpu_times)

print(f"\ntime spent on CPU for raw kernel: {rawK_avg * 1e3} ms")
print(f"times spent on CPU for cupy hist: {cupy_avg * 1e3} ms")

print(f"\ntimes spent on GPU raw kernel: {rawK_avg_gpu * 1e3} ms")
print(f"times spent on GPU cupy hist: {cupy_avg_gpu * 1e3} ms")

Correct results!

Numpy average time: np_histogram        :    CPU: 216648.786 us

GPU's raw kernel average time: histogram           :    CPU:     5.841 us   +/-  3.408 (min:     3.416 / max:    14.758) us     GPU-0:  1564.963 us   +/-  6.028 (min:  1559.552 / max:  1580.032) us
GPU's cupy hist average time: cp_histogram        :    CPU:  1762.620 us   +/- 338.493 (min:  1478.171 / max:  2318.047) us     GPU-0:  7092.928 us   +/- 499.151 (min:  6714.368 / max:  8020.992) us

time spent on CPU for raw kernel: 0.005840882658958435 ms
times spent on CPU for cupy hist: 1.7626198707148433 ms

times spent on GPU raw kernel: 1.5649631738662717 ms
times spent on GPU cupy hist: 7.0929279804229735 ms
