# Comparisson of [Nvidia Video Codec SDK ](https://developer.nvidia.com/nvidia-video-codec-sdk) with CPU and iGPU ([Quick Sync](https://en.wikipedia.org/wiki/Intel_Quick_Sync_Video)) decoding using OpenCV 4.5.0 with python - CUDA 10.0, Cuda Video Codec SDK 11.0.10 and Ffmpeg 4.3.1 master  99888-g5c7823ff1c-win64-lgpl - GPU Driver 457.30 

To run the h264_cuvid and hevc_cuvid test in the notebook you will need to compile OpenCV against Ffmpeg libs built with --enable-cuda --enable-cuvid.

Notes: 
1. Whilst using VideoCapture with h264_cuvid decoding is only as fast as CPU decoding, it does offload the decoding leaving more CPU resources available.  Additionally this option currently supports far more codecs than cv.cudacodec.VideoReader.
2. GPU codec support depends on the GPU generation, see NVDEC_VideoDecoder_API_ProgGuide.pdf in the [NVidia Video Codec SDK documentation](https://developer.nvidia.com/nvidia-video-codec-sdk) for details.
3. Unfortunately the current QuickSync implementation does not support container formats or RTSP streaming.

## Init

In [1]:
#export
import os
import time
import numpy as np
from functools import partial
import matplotlib.pyplot as plt
import cv2 as cv
import pandas as pd
import psutil

In [2]:
#export
# globals
#vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/768x576.avi'
#vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/1920x1080.avi'
#vid_path = 'rtsp://127.0.0.1/mediafile.264'
#vid_path="rtsp://127.0.0.1/jellyfish-120-mbps-4k-uhd-h264.264";
#vid_path="rtsp://127.0.0.1/big_buck_bunny.264";
# test files from http://jell.yfish.us/
vid_path_264 = os.environ['USERPROFILE'] + '/Videos/jellyfish-120-mbps-4k-uhd-h264.mkv'
vid_path_265 = os.environ['USERPROFILE'] + '/Videos/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv'

check_res = False

In [3]:
def CheckFrames(f1,f2,epsilon = -1,rows = -1,cols = -1,channels = -1):
    assert len(f1) > 0 and len(f1) == len(f2), f'f1 length {len(f1)}, f2 length {len(f2)}'    
    epsilon = 0 if epsilon == -1 else epsilon
    rows = f1[0].shape[0] if rows == -1 else rows
    cols = f1[0].shape[1] if cols == -1 else cols
    channels = f1[0].shape[2] if channels == -1 and len(f1[0].shape) == 3  else channels    
    for i in range(0,len(f1)):
        assert np.sum(f1[i][:rows,:cols,:channels] != f2[i][:rows,:cols,:channels]) <= epsilon, f'frame {i} different'

In [4]:
%matplotlib inline

In [5]:
#export
def ProcVid0(cap, measure_cpu = True):
    n_frames, start, end, max_cpu, av_cpu, n_cpu_snapshots  = 0,0,0,0,0,0
    if(measure_cpu):
        p = psutil.Process()
        cpu_count = psutil.cpu_count()
    if (cap.IsOpen()== False): # replace with catch
        print("Error opening video stream or file")
        return
    frames_available = True
    start = time.time()    
    while(cap.IsOpen()):
        ret,_ = cap.GetFrame()
        if (measure_cpu):
            cpu_all_pc = p.cpu_percent()
            if(cpu_all_pc > 0):
                n_cpu_snapshots +=1
                cpu_pc = cpu_all_pc/cpu_count
                max_cpu = max(cpu_pc,max_cpu)
                av_cpu += cpu_pc   
        if(ret):
            n_frames += 1 
    end = time.time()
    if(measure_cpu): 
        print(f'CPU utilization - max: {max_cpu:.2f}%, average {av_cpu/(n_cpu_snapshots):.2f}%')
    return (end - start)*1000/n_frames, n_frames;

In [6]:
#export
# host mem not implemented, manually pin memory
class PinnedMem(object):
    def __init__(self, size, dtype=np.uint8):
        self.array = np.empty(size,dtype)
        cv.cuda.registerPageLocked(self.array)
        self.pinned = True
    def __del__(self):
        cv.cuda.unregisterPageLocked(self.array)
        self.pinned = False
    def __repr__(self):
        return f'pinned = {self.pinned}'

In [7]:
#export
class VidCap:
    def __init__(self,vid_path,max_frames = -1,store_res = False,file_to_write=None):
        self.vid_path = vid_path
        self.store_res = store_res
        self.res = []
        self.frame_num = 0
        self.open = False
        cap = cv.VideoCapture(vid_path)
        assert cap.isOpened(), f"{vid_path}: cannot be opened!"
        self.num_frames = cap.get(cv.CAP_PROP_FRAME_COUNT)
        self.max_frames = self.num_frames if max_frames == -1 else max_frames
        ret, frame = cap.read()
        cap.release()
        self.rows,self.cols,self.channels = frame.shape
        self.write_video = False
        if(file_to_write):
            self.write_video = True
            #fourcc = cv.VideoWriter_fourcc(*'H264')
            fourcc = cv.VideoWriter_fourcc('M', '4', 'S', '2')
            self.out = cv.VideoWriter(file_to_write,cv.CAP_FFMPEG,fourcc,25,(self.cols,self.rows))
    
    def UpdateState(self,ret): 
        if (not ret or self.frame_num+1 == self.max_frames): 
            self.open = False            
        if(ret or self.frame_num+1 == self.max_frames):
            self.frame_num += 1
        
    def IsOpen(self): return self.open
    
    def __del__(self):
        if(self.write_video):
            self.out.release()
        
    #def WriteFrame(self,frame):
    #    if(self.file_to_write):
    #        self.out.write(frame)
        
class CudaCap(VidCap):
    def __init__(self,vid_path,max_frames=-1, store_res=False, file_to_write=None):
        VidCap.__init__(self, vid_path, max_frames, store_res, file_to_write)
        # cudacodec always returns 4 channels - check grey video
        self.channels = 4
        # cudacodec seems to need rows/16
        self.rows = (np.ceil(self.rows/16)*16).astype(int)
        self.cap = cv.cudacodec.createVideoReader(self.vid_path)
        self.open = True
        self.frame_device = cv.cuda_GpuMat(self.rows,self.cols,cv.CV_8UC4)
        self.frame_host = PinnedMem((self.rows,self.cols,self.channels))
        self.stream = cv.cuda_Stream()
            
    def GetFrame(self):
        if(self.store_res or self.write_video):
            ret,_ = self.GetHostFrame()
            if(self.write_video):
                self.out.write(self.frame_host.array[:,:,:3])
            if(self.store_res and ret):
                self.res.append(np.copy(self.frame_host.array))
            return ret,self.frame_device
        else:
            return self.GetDeviceFrame()
            
    def GetDeviceFrame(self):
        ret,_ = self.cap.nextFrame(self.frame_device,self.stream)
        self.UpdateState(ret)
        return ret,self.frame_device
        
    def GetHostFrame(self):
        ret,_ = self.GetDeviceFrame()
        if(ret):
            self.frame_device.download(self.frame_host.array)
        return ret,self.frame_host.array
        
    
class CudaCapNpa(CudaCap):
    def __init__(self,vid_path,max_frames=-1,store_res=False, file_to_write=None):
        CudaCap.__init__(self, vid_path, max_frames, store_res, file_to_write)
            
    def GetDeviceFrame(self):        
        ret,self.frame_device = self.cap.nextFrame()
        self.UpdateState(ret)
        return ret,self.frame_device
    
class CpuCap(VidCap):
    def __init__(self,vid_path,max_frames=-1,store_res=False,backend=cv.CAP_ANY):
        VidCap.__init__(self, vid_path, max_frames, store_res)      

        self.cap = cv.VideoCapture(self.vid_path,backend)
        assert self.cap.isOpened(), f"{vid_path}: cannot be opened for backend: {backend}!"
        #if self.cap.isOpened():
        self.open = True
        self.frame = np.empty((self.rows,self.cols,self.channels),np.uint8)
            
    def GetFrame(self):
        ret,_ = self.cap.read(self.frame)
        self.UpdateState(ret)
        if (ret):
            if(self.store_res):
                self.res.append(np.copy(self.frame))
        return ret,self.frame
                
    def __del__(self):
        self.cap.release()
        
class CpuCapNpa(CpuCap):
    def __init__(self,vid_path,max_frames=-1,store_res=False,backend=cv.CAP_ANY):
        CpuCap.__init__(self, vid_path, max_frames, store_res,backend)      
            
    def GetFrame(self):
        ret,self.frame = self.cap.read()
        self.UpdateState(ret)
        if (ret):
            if(self.store_res):
                self.res.append(np.copy(self.frame))
        return ret,self.frame
    

<a id="cpu"></a>

## CPU

### h264

In [12]:
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = ""

In [13]:
#export
cpu_cap_npa = CpuCapNpa(vid_path_264,-1,check_res)
cpu_time_0,n_frames = ProcVid0(cpu_cap_npa)
print(f'CPU 0 (no pre alloc): {n_frames} frames, {cpu_time_0:.2f} ms/frame')

CPU utilization - max: 60.77%, average 39.18%
CPU 0 (no pre alloc): 900 frames, 15.41 ms/frame


In [14]:
#export
cpu_cap = CpuCap(vid_path_264,-1,check_res)
cpu_time_1,n_frames = ProcVid0(cpu_cap)
print(f'CPU 1: {n_frames} frames, {cpu_time_1:.2f} ms/frame')

CPU utilization - max: 104.17%, average 76.64%
CPU 1: 900 frames, 8.63 ms/frame


In [15]:
if(check_res):
    CheckFrames(cpu_cap.res,cpu_cap_npa.res)

### h265

In [16]:
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = ""

In [17]:
#export
cpu_cap_npa = CpuCapNpa(vid_path_265,-1,check_res)
cpu_time_0,n_frames = ProcVid0(cpu_cap_npa)
print(f'CPU 0 (no pre alloc): {n_frames} frames, {cpu_time_0:.2f} ms/frame')

CPU utilization - max: 65.11%, average 36.68%
CPU 0 (no pre alloc): 900 frames, 34.34 ms/frame


In [18]:
#export
cpu_cap = CpuCap(vid_path_265,-1,check_res)
cpu_time_1,n_frames = ProcVid0(cpu_cap)
print(f'CPU 1: {n_frames} frames, {cpu_time_1:.2f} ms/frame')

CPU utilization - max: 75.61%, average 45.85%
CPU 1: 900 frames, 27.27 ms/frame


In [19]:
if(check_res):
    CheckFrames(cpu_cap.res,cpu_cap_npa.res)

<a id="cpu_quicksync"></a>

## CPU - Quicksync

If this implementation is slower than the default, software decoding is probably taking place.

To confirm hardware decoding in windows 10 check Video Decode window in the GPU pane of the task manager for activity as below. 

To fix check drivers, OpenCv version etc. 

![title](imgs/quicksync.PNG)

In [15]:
#export
cpu_cap_mfx_npa = CpuCapNpa(vid_path_264,-1,check_res,cv.CAP_INTEL_MFX)
cpu_time_mfx_0,n_frames = ProcVid0(cpu_cap_mfx_npa)
print(f'CPU Quick Sync (no pre alloc): {n_frames} frames, {cpu_time_mfx_0:.2f} ms/frame')

CPU Quick Sync (no pre alloc): 900 frames, 29.84 ms/frame


In [16]:
#export
cpu_cap_mfx = CpuCap(vid_path_264,-1,check_res,cv.CAP_INTEL_MFX)
cpu_time_mfx_1,n_frames = ProcVid0(cpu_cap_mfx)
print(f'CPU Quick Sync (no pre alloc): {n_frames} frames, {cpu_time_mfx_1:.2f} ms/frame')

CPU Quick Sync (no pre alloc): 900 frames, 28.68 ms/frame


<a id="gpu"></a>

## GPU

Not all GPU's have a hardware decoder, e.g. anything with GM108, see
[(1)](https://devtalk.nvidia.com/default/topic/1024934/video-codec-and-optical-flow-sdk/cuvidcreatedecoder-returns-error-cuda_error_no_device/) and 
[(2)](https://developer.nvidia.com/video-encode-decode-gpu-support-matrix) for discussions.

### cv.VideoCapture - h264_cuvid

#### h264

In [20]:
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = "video_codec;h264_cuvid"

In [21]:
#export
cpu_cap_npa = CpuCapNpa(vid_path_264,-1,check_res)
cpu_time_0,n_frames = ProcVid0(cpu_cap_npa)
print(f'CPU 0 with h264_cuvid (no pre alloc): {n_frames} frames, {cpu_time_0:.2f} ms/frame')

CPU utilization - max: 8.49%, average 8.23%
CPU 0 with h264_cuvid (no pre alloc): 900 frames, 32.84 ms/frame


In [22]:
#export
cpu_cap = CpuCap(vid_path_264,-1,check_res)
cpu_time_1,n_frames = ProcVid0(cpu_cap)
print(f'CPU 1 with h264_cuvid: {n_frames} frames, {cpu_time_1:.2f} ms/frame')

CPU utilization - max: 17.36%, average 8.30%
CPU 1 with h264_cuvid: 900 frames, 25.20 ms/frame


#### h265

In [23]:
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = "video_codec;hevc_cuvid"

In [24]:
#export
cpu_cap_npa = CpuCapNpa(vid_path_265,-1,check_res)
cpu_time_0,n_frames = ProcVid0(cpu_cap_npa)
print(f'CPU 0 with hevc_cuvid(no pre alloc): {n_frames} frames, {cpu_time_0:.2f} ms/frame')

CPU utilization - max: 12.60%, average 8.24%
CPU 0 with hevc_cuvid(no pre alloc): 900 frames, 38.32 ms/frame


In [25]:
#export
cpu_cap = CpuCap(vid_path_265,-1,check_res)
cpu_time_1,n_frames = ProcVid0(cpu_cap)
print(f'CPU 1 with hevc_cuvid: {n_frames} frames, {cpu_time_1:.2f} ms/frame')

CPU utilization - max: 16.28%, average 8.24%
CPU 1 with hevc_cuvid: 900 frames, 30.20 ms/frame


### cv.cudacodec.VideoReader

In [26]:
#export
gpu_cap_npa = CudaCapNpa(vid_path_264,-1,check_res)
gpu_time_0,n_frames = ProcVid0(gpu_cap_npa)
print(f'GPU 0 (no pre alloc): {n_frames} frames, {gpu_time_0:.2f} ms/frame')

CPU utilization - max: 8.68%, average 8.38%
GPU 0 (no pre alloc): 900 frames, 5.89 ms/frame


In [27]:
gpu_cap_npa = CudaCapNpa(vid_path_264,-1,check_res)
gpu_time_0,n_frames = ProcVid0(gpu_cap_npa)
print(f'GPU 0 (no pre alloc): {n_frames} frames, {gpu_time_0:.2f} ms/frame')

CPU utilization - max: 8.68%, average 8.40%
GPU 0 (no pre alloc): 900 frames, 5.81 ms/frame


In [28]:
if(check_res):
    n_frames = min(len(gpu_cap.res),len(gpu_cap_npa.res))
    CheckFrames(gpu_cap.res[:n_frames],gpu_cap_npa.res[:n_frames])

<a id='opencv_decoding_performance_comparisson'></a>

## Results

### h264

In [29]:
results = [['i7-6700HQ', 39.39],['i7-6700HQ (pre-alloc)', 23.99],['i5-6500 (pre-alloc)',22.01],['i5-5200U HDD',58.64],
           ['i5-5200U HDD (pre-alloc)',51.06],['HD Graphics 530',14.73],['HD Graphics 530 (pre-alloc)',9.27],
           ['HD Graphics 5500 HDD',23.74],['HD Graphics 5500 HDD (pre-alloc)',18.03],['GTX 980M',14.34],
          ['GTX 980M (pre-alloc)',11.74],['GTX 1060 (pre-alloc)',7.85],['i5-4210U',50.65],['i5-4210U (pre-alloc)',47.72],
           ['GT 730M',40.80],['GT 730M (pre-alloc)',40.64],['HD Graphics 4400',23.88],['HD Graphics 4400 (pre-alloc)',13.97],
          ['i7-8700',19.07],['i7-8700 (pre-alloc)',12.09],['RTX 2080 Mobile h264_cuvid',32.84],
           ['RTX 2080 Mobile h264_cuvid (pre-alloc)',25.20],['RTX 2080 Mobile',6.03],['RTX 2080 Mobile (pre-alloc)',5.78]]
df = pd.DataFrame(results,columns=['CPU/GPU','Frame Proc Time (ms)']).sort_values('Frame Proc Time (ms)').round(2)
df.style.hide_index()

CPU/GPU,Frame Proc Time (ms)
RTX 2080 Mobile (pre-alloc),5.78
RTX 2080 Mobile,6.03
GTX 1060 (pre-alloc),7.85
HD Graphics 530 (pre-alloc),9.27
GTX 980M (pre-alloc),11.74
i7-8700 (pre-alloc),12.09
HD Graphics 4400 (pre-alloc),13.97
GTX 980M,14.34
HD Graphics 530,14.73
HD Graphics 5500 HDD (pre-alloc),18.03


### h265

# Export

In [84]:
# taken from https://github.com/fastai/fastai_docs/blob/master/dev_nb/notebook2script.py
!python notebook2script.py opencv410x-video-read.ipynb

Converted opencv410x-video-read.ipynb to exp\nb_opencv410x-video-read.py


In [None]:
! python exp/nb_opencv410x-video-read.py