In [1]:
import cv2
import numpy as np
import time
import GPUtil
import platform
import cpuinfo

In [2]:
gpus = GPUtil.getGPUs()
print(cv2.__version__)
print(cpuinfo.get_cpu_info()['brand_raw'])
print(gpus[0].name)

4.5.2
Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
GeForce RTX 2080


# OpenCV without optimization

In [3]:
im = np.random.random((4000, 4000,3)).astype(np.uint8)
loop_cnt = 1000

## CPU

In [4]:
start_t = time.time()
for _ in range(loop_cnt):    
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    retval,thr = cv2.threshold(gray,128,255,cv2.THRESH_BINARY)
    morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
    morph = cv2.dilate(thr,morph_kernel)
    morph = cv2.resize(morph,(640,480))    
print('cpu time: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))

cpu time: 15007.12 us


### Pre-alloc return arrays and remove constant ops

In [5]:
# use ones from above except morph
morph_sm = np.empty((480,640),np.uint8)
morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
start_t = time.time()
for _ in range(loop_cnt):    
    cv2.cvtColor(im, cv2.COLOR_BGR2GRAY, gray)
    cv2.threshold(gray,128,255,cv2.THRESH_BINARY,thr)    
    cv2.dilate(thr,morph_kernel,morph)
    cv2.resize(morph,(640,480),morph_sm)
cpu_time = (time.time() - start_t) * 1e6 / loop_cnt
print('cpu time: {:.2f} us'.format(cpu_time))

cpu time: 12000.67 us


## GPU

In [7]:
start_t = time.time()
for _ in range(loop_cnt):
    gpu_frame = cv2.cuda_GpuMat()
    gpu_frame.upload(im)
    gpu_gray = cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY)
    retval,gpu_thr = cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY)
    morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
    morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
    gpu_morph = morph_filter.apply(gpu_thr)
    gpu_morph = cv2.cuda.resize(gpu_morph,(640,480))
    res = gpu_morph.download()
print('gpu time: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))

gpu time: 16019.12 us


# OpenCV with optimization

## Demonstrate warm up

In [6]:
start_t = time.time()
loop_cnt_warm = 5
for _ in range(loop_cnt_warm):
    gpu_frame = cv2.cuda_GpuMat()
    gpu_frame.upload(im)
    gpu_gray = cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY)
    retval,gpu_thr = cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY)
    morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
    morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
    gpu_morph = morph_filter.apply(gpu_thr)
    gpu_morph = cv2.cuda.resize(gpu_morph,(640,480))
    res = gpu_morph.download()
print('gpu time warm up: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt_warm))
start_t = time.time()
for _ in range(loop_cnt):
    gpu_frame = cv2.cuda_GpuMat()
    gpu_frame.upload(im)
    gpu_gray = cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY)
    retval,gpu_thr = cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY)
    morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
    morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
    gpu_morph = morph_filter.apply(gpu_thr)
    gpu_morph = cv2.cuda.resize(gpu_morph,(640,480))
    res = gpu_morph.download()
print('gpu time when warm: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))

gpu time warm up: 116695.36 us
gpu time when warm: 16002.82 us


### Exlude CPU ops and upload/dload

In [8]:
morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
gpu_frame = cv2.cuda_GpuMat(im)
start_t = time.time()
for _ in range(loop_cnt):
    gpu_gray = cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY)
    retval,gpu_thr = cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY)
    morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
    gpu_morph = morph_filter.apply(gpu_thr)
    gpu_morph = cv2.cuda.resize(gpu_morph,(640,480))    
print('gpu time: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))
res = gpu_morph.download()

gpu time: 7100.30 us


### Pre-allocate

In [9]:
morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
gpu_frame = cv2.cuda_GpuMat(im)
gpu_gray = cv2.cuda_GpuMat(im.shape[:-1][::-1],cv2.CV_8UC1)
gpu_thr = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph_sm = cv2.cuda_GpuMat((640,480),gpu_gray.type())
start_t = time.time()
for _ in range(loop_cnt):
    cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY,gpu_gray)
    cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY,gpu_thr)
    morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
    morph_filter.apply(gpu_thr,gpu_morph)
    cv2.cuda.resize(gpu_morph,(640,480),gpu_morph_sm)    
print('gpu time: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))
res = gpu_morph.download()

gpu time: 4493.87 us


### Remove morph_filter creation as this would also be pre-computed in practice

In [10]:
morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
gpu_frame = cv2.cuda_GpuMat(im)
gpu_gray = cv2.cuda_GpuMat(im.shape[:-1][::-1],cv2.CV_8UC1)
gpu_thr = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph_sm = cv2.cuda_GpuMat((640,480),gpu_gray.type())
morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
start_t = time.time()
for _ in range(loop_cnt):
    cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY,gpu_gray)
    cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY,gpu_thr)    
    morph_filter.apply(gpu_thr,gpu_morph)
    cv2.cuda.resize(gpu_morph,(640,480),gpu_morph_sm)    
print('gpu time: {:.2f} us'.format((time.time() - start_t) * 1e6 / loop_cnt))
res = gpu_morph.download()

gpu time: 3620.95 us


### Stream - worst case scenario stall on each loop iteration

In [11]:
stream = cv2.cuda_Stream()
morph_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(7,7))
gpu_frame = cv2.cuda_GpuMat(im)
gpu_gray = cv2.cuda_GpuMat(im.shape[:-1][::-1],cv2.CV_8UC1)
gpu_thr = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph = cv2.cuda_GpuMat(gpu_gray.size(),gpu_gray.type())
gpu_morph_sm = cv2.cuda_GpuMat((640,480),gpu_gray.type())
morph_filter = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE,cv2.CV_8U,morph_kernel)
start_t = time.time()
for _ in range(loop_cnt):
    cv2.cuda.cvtColor(gpu_frame, cv2.COLOR_BGR2GRAY,gpu_gray,stream = stream)
    cv2.cuda.threshold(gpu_gray,128,255,cv2.THRESH_BINARY,gpu_thr,stream=stream)    
    morph_filter.apply(gpu_thr,gpu_morph,stream=stream)
    cv2.cuda.resize(gpu_morph,(640,480),gpu_morph_sm,stream=stream)
    stream.waitForCompletion()
gpu_time = (time.time() - start_t) * 1e6 / loop_cnt
print('gpu time: {:.2f} us'.format(gpu_time))
res = gpu_morph.download()

gpu time: 3443.38 us


In [26]:
# No real improvement from streams likely hard sync inside one of the routines

# Speed up

In [12]:
cpu_time/gpu_time

3.485140688715614