# Comparison of [Nvidia Video Codec SDK](https://developer.nvidia.com/nvidia-video-codec-sdk) with CPU and iGPU ([Quick Sync](https://en.wikipedia.org/wiki/Intel_Quick_Sync_Video)) decoding using OpenCV 4.1.x with Python

To run the notebook, download the modified binary [here](https://mega.nz/#!SAwCWY7D!Av4-wPjAkm6rlANWfJbp1R8HlahueT56bhJSAhvSN18).

Notes: 
1. Will not work correctly with OpenCV 4.1.0 because:
    - The Python bindings do not work correctly; `pyopencv_generated_types.h` was manually modified so that `nextFrame()` on the reader returned by `cv.cudacodec.createVideoReader()` works.
    - HEVC is not enabled for the Nvidia decoder.
    - Quick Sync can load the software decoder if more than one device is present (multiple GPUs, Nvidia Optimus etc.)
2. `nextFrame()` on the reader returned by `cv.cudacodec.createVideoReader()` stops returning frames before the end of the video file (899 of 900 frames in the GPU runs below).
3. CPU decoding supports far more codecs than the GPU, additionally GPU codec support depends on the GPU generation, see NVDEC_VideoDecoder_API_ProgGuide.pdf in the [NVidia Video Codec SDK documentation](https://developer.nvidia.com/nvidia-video-codec-sdk) for details.

## Init

In [2]:
#export
import os
import time
import numpy as np
from functools import partial
import matplotlib.pyplot as plt
import cv2 as cv
import pandas as pd

In [15]:
#export
# globals
# Directory that holds the "Videos" folder with the test clips. USERPROFILE is
# only defined on Windows, so fall back to the current user's home directory
# elsewhere instead of failing with a KeyError.
home_dir = os.environ.get('USERPROFILE', os.path.expanduser('~'))

#vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/768x576.avi'
#vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/1920x1080.avi'
#vid_path = 'rtsp://192.168.1.2/mediafile.264'

# test files from http://jell.yfish.us/
vid_path = home_dir + '/Videos/jellyfish-120-mbps-4k-uhd-h264.mkv'
#vid_path = home_dir + '/Videos/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv'

# When True the capture classes keep a copy of every decoded frame so the
# CheckFrames cells can compare the decoders' outputs.
check_res = False

In [3]:
def CheckFrames(f1,f2,epsilon = -1,rows = -1,cols = -1,channels = -1):
    """Assert that two sequences of frames match element-wise within a tolerance.

    Every frame is cropped to its top-left ``rows`` x ``cols`` region (and, for
    3-D frames, to its first ``channels`` channels) before the comparison, so
    frames that differ only in padding or in trailing channels can be compared.

    Args:
        f1, f2: non-empty, equal-length sequences of numpy arrays.
        epsilon: maximum number of differing elements allowed per frame pair;
            -1 means 0 (exact match).
        rows, cols: size of the compared region; -1 uses the shape of ``f1[0]``.
        channels: number of leading channels compared; -1 uses the channel
            count of ``f1[0]`` when it is 3-D, otherwise all channels.

    Raises:
        AssertionError: if the sequences are empty or differ in length, or if
            any frame pair differs in more than ``epsilon`` elements.
    """
    assert len(f1) > 0 and len(f1) == len(f2), f'f1 length {len(f1)}, f2 length {len(f2)}'
    epsilon = 0 if epsilon == -1 else epsilon
    rows = f1[0].shape[0] if rows == -1 else rows
    cols = f1[0].shape[1] if cols == -1 else cols
    if channels == -1 and f1[0].ndim == 3:
        channels = f1[0].shape[2]
    # A leftover -1 means "no channel limit"; slicing with :-1 would silently
    # drop the last channel instead.
    channel_slice = slice(None) if channels == -1 else slice(channels)

    def crop(frame):
        # 2-D (single channel) frames have no channel axis to index.
        if frame.ndim == 3:
            return frame[:rows,:cols,channel_slice]
        return frame[:rows,:cols]

    for i,(a,b) in enumerate(zip(f1,f2)):
        assert np.sum(crop(a) != crop(b)) <= epsilon, f'frame {i} different'

In [4]:
%matplotlib inline

In [5]:
#export
def ProcVid0(cap):
    """Pull frames from ``cap`` until it closes and report the mean time per frame.

    Args:
        cap: capture wrapper exposing ``IsOpen()`` and ``GetFrame()``, where
            ``GetFrame()`` returns a ``(ret, frame)`` pair.

    Returns:
        ``(ms_per_frame, n_frames)``. ``ms_per_frame`` is NaN when no frame
        was decoded. Returns None (after printing a message) when ``cap`` is
        not open on entry.
    """
    if not cap.IsOpen(): # replace with catch
        print("Error opening video stream or file")
        return
    n_frames = 0
    # perf_counter is the monotonic, high-resolution clock intended for timing.
    start = time.perf_counter()
    while cap.IsOpen():
        ret,_ = cap.GetFrame()
        if ret:
            n_frames += 1
    elapsed_ms = (time.perf_counter() - start)*1000
    if n_frames == 0:
        # Nothing was decoded: the per-frame time is undefined, so do not divide by zero.
        return float('nan'), 0
    return elapsed_ms/n_frames, n_frames

In [6]:
#export
# host mem not implemented, manually pin memory
class PinnedMem(object):
    """Numpy host array registered as page-locked memory through OpenCV CUDA.

    The array is allocated with ``np.empty`` and passed to
    ``cv.cuda.registerPageLocked``; it is unregistered again when the object
    is destroyed.

    Attributes are documented inline: ``array`` is the buffer, ``pinned``
    records whether it is currently registered.
    """
    def __init__(self, size, dtype=np.uint8):
        # Set first so __del__ stays safe if allocation or registration raises.
        self.pinned = False
        self.array = np.empty(size,dtype)
        cv.cuda.registerPageLocked(self.array)
        self.pinned = True
    def __del__(self):
        # Only unregister memory that was actually registered, and only once.
        if getattr(self, 'pinned', False):
            cv.cuda.unregisterPageLocked(self.array)
            self.pinned = False
    def __repr__(self):
        return f'pinned = {self.pinned}'

In [26]:
#export
class VidCap:
    """Shared state for the capture wrappers used by the benchmarks.

    On construction the video is opened once with ``cv.VideoCapture`` to read
    the frame count and the shape of the first frame; that probe capture is
    then released. Subclasses open their own decoder and set ``self.open``.

    Args:
        vid_path: path or URL handed to ``cv.VideoCapture``.
        max_frames: frame limit checked by ``UpdateState``; -1 uses the frame
            count reported by the capture.
        store_res: stored on the instance; subclasses use it to decide
            whether decoded frames are appended to ``self.res``.

    Raises:
        AssertionError: if the video cannot be opened or its first frame
            cannot be read.
    """
    def __init__(self,vid_path,max_frames = -1,store_res = False):
        self.vid_path = vid_path
        self.store_res = store_res
        self.res = []
        self.frame_num = 0
        self.open = False
        cap = cv.VideoCapture(vid_path)
        try:
            assert cap.isOpened(), f"{vid_path}: cannot be opened!"
            self.num_frames = cap.get(cv.CAP_PROP_FRAME_COUNT)
            self.max_frames = self.num_frames if max_frames == -1 else max_frames
            ret, frame = cap.read()
        finally:
            # Release the probe capture even when opening or reading failed.
            cap.release()
        # Without this check a failed read surfaces as an AttributeError on None.
        assert ret and frame is not None, f"{vid_path}: cannot read first frame!"
        self.rows,self.cols,self.channels = frame.shape

    def UpdateState(self,ret):
        """Update ``open`` and ``frame_num`` after a read attempt.

        The capture is marked closed when the read failed or when this read
        reaches ``max_frames``. ``frame_num`` is incremented when the read
        succeeded or when this read reaches ``max_frames``.
        """
        if (not ret or self.frame_num+1 == self.max_frames):
            self.open = False
        if(ret or self.frame_num+1 == self.max_frames):
            self.frame_num += 1

    def IsOpen(self): return self.open
        
class CudaCap(VidCap):
    """Capture wrapper around ``cv.cudacodec.createVideoReader``.

    Frames are decoded into a ``cv.cuda_GpuMat`` that is allocated once and
    passed to every ``nextFrame`` call. When ``store_res`` is set each frame
    is also downloaded into a pinned host buffer and a copy is kept in
    ``self.res``.
    """
    def __init__(self,vid_path,max_frames=-1, store_res=False):
        super().__init__(vid_path, max_frames, store_res)
        # cudacodec always returns 4 channels - check grey video
        self.channels = 4
        # cudacodec seems to need rows/16
        padded_rows = np.ceil(self.rows/16)*16
        self.rows = padded_rows.astype(int)
        self.cap = cv.cudacodec.createVideoReader(self.vid_path)
        self.open = True
        self.frame_device = cv.cuda_GpuMat(self.rows,self.cols,cv.CV_8UC4)
        host_shape = (self.rows,self.cols,self.channels)
        self.frame_host = PinnedMem(host_shape)

    def GetFrame(self):
        """Read the next frame; returns ``(ret, device_frame)``."""
        if not self.store_res:
            return self.GetDeviceFrame()
        # Storing results needs the frame on the host, so go through the download path.
        ok = self.GetHostFrame()[0]
        if ok:
            self.res.append(np.copy(self.frame_host.array))
        return ok,self.frame_device

    def GetDeviceFrame(self):
        """Decode the next frame into the device buffer and update the state."""
        ok = self.cap.nextFrame(self.frame_device)[0]
        self.UpdateState(ok)
        return ok,self.frame_device

    def GetHostFrame(self):
        """Decode the next frame and, on success, download it to the pinned host array."""
        ok = self.GetDeviceFrame()[0]
        if ok:
            self.frame_device.download(self.frame_host.array)
        return ok,self.frame_host.array
    
class CudaCapNpa(CudaCap):
    """``CudaCap`` variant that calls ``nextFrame()`` without an output buffer.

    ("Npa" = no pre-allocation, matching the "no pre alloc" benchmark labels.)
    """
    def __init__(self,vid_path,max_frames=-1,store_res=False):
        super().__init__(vid_path, max_frames, store_res)

    def GetDeviceFrame(self):
        """Decode the next frame, keep whatever ``nextFrame()`` returns as the device frame."""
        ok,decoded = self.cap.nextFrame()
        self.frame_device = decoded
        self.UpdateState(ok)
        return ok,decoded
    
class CpuCap(VidCap):
    """Capture wrapper around ``cv.VideoCapture`` that reuses one frame buffer.

    A numpy array with the probed frame shape is allocated once and passed to
    every ``cap.read`` call.

    Args:
        vid_path, max_frames, store_res: see ``VidCap``.
        backend: ``cv.VideoCapture`` API preference, e.g. ``cv.CAP_INTEL_MFX``.

    Raises:
        AssertionError: if the video cannot be opened with ``backend``.
    """
    def __init__(self,vid_path,max_frames=-1,store_res=False,backend=cv.CAP_ANY):
        # Defined before anything can raise so __del__ is safe on a half-built object.
        self.cap = None
        VidCap.__init__(self, vid_path, max_frames, store_res)

        self.cap = cv.VideoCapture(self.vid_path,backend)
        assert self.cap.isOpened(), f"{vid_path}: cannot be opened for backend: {backend}!"
        self.open = True
        self.frame = np.empty((self.rows,self.cols,self.channels),np.uint8)

    def GetFrame(self):
        """Read the next frame; returns ``(ret, frame)`` and stores a copy when ``store_res`` is set."""
        ret,frame = self.cap.read(self.frame)
        self.UpdateState(ret)
        if (ret):
            # NOTE(review): OpenCV may return a different array than the buffer
            # passed in (e.g. if the decoded frame does not fit it); keep what
            # was returned so self.frame never holds stale data. When the
            # buffer was reused this is the same object.
            self.frame = frame
            if(self.store_res):
                self.res.append(np.copy(self.frame))
        return ret,self.frame

    def __del__(self):
        # self.cap is missing or None when construction failed early.
        cap = getattr(self, 'cap', None)
        if cap is not None:
            cap.release()
        
class CpuCapNpa(CpuCap):
    """``CpuCap`` variant that calls ``cap.read()`` without an output buffer.

    ("Npa" = no pre-allocation, matching the "no pre alloc" benchmark labels.)
    """
    def __init__(self,vid_path,max_frames=-1,store_res=False,backend=cv.CAP_ANY):
        super().__init__(vid_path, max_frames, store_res, backend)

    def GetFrame(self):
        """Read the next frame; returns ``(ret, frame)`` and stores a copy when ``store_res`` is set."""
        ok,grabbed = self.cap.read()
        self.frame = grabbed
        self.UpdateState(ok)
        if ok and self.store_res:
            self.res.append(np.copy(grabbed))
        return ok,grabbed
    

<a id="cpu"></a>

## CPU

In [8]:
#export
# CPU decode, default backend, no pre-allocated frame buffer: CpuCapNpa calls
# cap.read() without an output array. max_frames=-1 uses the frame count
# reported by the capture.
cpu_cap_npa = CpuCapNpa(vid_path,-1,check_res)
cpu_time_0,n_frames = ProcVid0(cpu_cap_npa)
print(f'CPU 0 (no pre alloc): {n_frames} frames, {cpu_time_0:.2f} ms/frame')

CPU 0 (no pre alloc): 900 frames, 39.39 ms/frame


In [9]:
#export
# CPU decode, default backend, with CpuCap: the same numpy buffer is passed to
# every cap.read() call.
cpu_cap = CpuCap(vid_path,-1,check_res)
cpu_time_1,n_frames = ProcVid0(cpu_cap)
print(f'CPU 1: {n_frames} frames, {cpu_time_1:.2f} ms/frame')

CPU 1: 900 frames, 23.99 ms/frame


In [10]:
# Frames are only stored when check_res is set; in that case require both CPU
# runs to have produced identical frames (CheckFrames defaults to zero
# differing elements).
if(check_res):
    CheckFrames(cpu_cap.res,cpu_cap_npa.res)

<a id="cpu_quicksync"></a>

## CPU - Quicksync

If this implementation is slower than the default, software decoding is probably taking place.

To confirm hardware decoding in Windows 10, check the Video Decode graph in the GPU pane of the Task Manager for activity, as shown below.

To fix, check drivers, OpenCV version etc.

![title](imgs/quicksync.PNG)

In [35]:
#export
# Clip used by the cv.CAP_INTEL_MFX runs (presumably the raw H.264 stream of
# the same jellyfish video - confirm). USERPROFILE is only defined on Windows,
# so fall back to the current user's home directory elsewhere.
vid_path_h264 = os.environ.get('USERPROFILE', os.path.expanduser('~')) + '/Videos/jellyfish-120-mbps-4k-uhd-h264.h264'

In [31]:
#export
# Decode through the cv.CAP_INTEL_MFX backend without a pre-allocated frame
# buffer (CpuCapNpa calls cap.read() without an output array).
cpu_cap_mfx_npa = CpuCapNpa(vid_path_h264,-1,check_res,cv.CAP_INTEL_MFX)
cpu_time_mfx_0,n_frames = ProcVid0(cpu_cap_mfx_npa)
print(f'CPU Quick Sync (no pre alloc): {n_frames} frames, {cpu_time_mfx_0:.2f} ms/frame')

CPU Quick Sync (no pre alloc): 900 frames, 14.73 ms/frame


In [33]:
#export
# Decode through the cv.CAP_INTEL_MFX backend with CpuCap, which passes the
# same pre-allocated numpy buffer to every cap.read() call.
cpu_cap_mfx = CpuCap(vid_path_h264,-1,check_res,cv.CAP_INTEL_MFX)
cpu_time_mfx_1,n_frames = ProcVid0(cpu_cap_mfx)
# This is the pre-allocated run; the label previously repeated "(no pre alloc)"
# from the cell above, making the two result lines indistinguishable.
print(f'CPU Quick Sync (pre alloc): {n_frames} frames, {cpu_time_mfx_1:.2f} ms/frame')

CPU Quick Sync (no pre alloc): 900 frames, 9.27 ms/frame


<a id="gpu"></a>

## GPU

Not all GPUs have a hardware decoder, e.g. anything with GM108, see
[(1)](https://devtalk.nvidia.com/default/topic/1024934/video-codec-and-optical-flow-sdk/cuvidcreatedecoder-returns-error-cuda_error_no_device/) and 
[(2)](https://developer.nvidia.com/video-encode-decode-gpu-support-matrix) for discussions.

In [16]:
#export
# GPU decode via cudacodec without a pre-allocated device frame: CudaCapNpa
# calls nextFrame() with no output GpuMat.
gpu_cap_npa = CudaCapNpa(vid_path,-1,check_res)
gpu_time_0,n_frames = ProcVid0(gpu_cap_npa)
print(f'GPU 0 (no pre alloc): {n_frames} frames, {gpu_time_0:.2f} ms/frame')

GPU 0 (no pre alloc): 899 frames, 14.34 ms/frame


In [17]:
#export
# GPU decode via cudacodec with CudaCap: the same GpuMat is passed to every
# nextFrame() call.
gpu_cap = CudaCap(vid_path,-1,check_res)
gpu_time_1,n_frames = ProcVid0(gpu_cap)
print(f'GPU 1: {n_frames} frames, {gpu_time_1:.2f} ms/frame')

GPU 1: 899 frames, 11.74 ms/frame


In [18]:
# Frames are only stored when check_res is set. Compare the two GPU runs over
# the number of frames both of them stored, since CheckFrames requires
# equal-length sequences.
if(check_res):
    n_frames = min(len(gpu_cap.res),len(gpu_cap_npa.res))
    CheckFrames(gpu_cap.res[:n_frames],gpu_cap_npa.res[:n_frames])

<a id='opencv_decoding_performance_comparisson'></a>

## Results

In [3]:
results = [['i7-6700HQ', 39.39],['i7-6700HQ (pre-alloc)', 23.99],['i5-6500 (pre-alloc)',22.01],['i5-5200U HDD',58.64],
           ['i5-5200U HDD (pre-alloc)',51.06],['HD Graphics 530',14.73],['HD Graphics 530 (pre-alloc)',9.27],
           ['HD Graphics 5500 HDD',23.74],['HD Graphics 5500 HDD (pre-alloc)',18.03],['GTX 980M',14.34],
          ['GTX 980M (pre-alloc)',11.74],['GTX 1060 (pre-alloc)',7.85],['i5-4210U',50.65],['i5-4210U (pre-alloc)',47.72],
           ['GT 730M',40.80],['GT 730M (pre-alloc)',40.64],['HD Graphics 4400',23.88],['HD Graphics 4400 (pre-alloc)',13.97]]
df = pd.DataFrame(results,columns=['CPU/GPU','Frame Proc Time (ms)']).sort_values('Frame Proc Time (ms)').round(2)
df.style.hide_index()

CPU/GPU,Frame Proc Time (ms)
GTX 1060 (pre-alloc),7.85
HD Graphics 530 (pre-alloc),9.27
GTX 980M (pre-alloc),11.74
HD Graphics 4400 (pre-alloc),13.97
GTX 980M,14.34
HD Graphics 530,14.73
HD Graphics 5500 HDD (pre-alloc),18.03
i5-6500 (pre-alloc),22.01
HD Graphics 5500 HDD,23.74
HD Graphics 4400,23.88


# Export

In [75]:
# taken from https://github.com/fastai/fastai_docs/blob/master/dev_nb/notebook2script.py
!python notebook2script.py opencv410x-video-read.ipynb

Converted opencv410x-video-read.ipynb to exp\nb_opencv410x-video-read.py


In [76]:
! python exp/nb_opencv410x-video-read.py

CPU 0 (no pre alloc): 900 frames, 37.96 ms/frame
CPU 1: 900 frames, 21.88 ms/frame
CPU Quick Sync (no pre alloc): 900 frames, 15.31 ms/frame
CPU Quick Sync (no pre alloc): 900 frames, 9.97 ms/frame
GPU 0 (no pre alloc): 899 frames, 13.84 ms/frame
GPU 1: 899 frames, 11.72 ms/frame
[ INFO:0] global D:\SSDBackup\Dev\Repos\opencv_fork_1\modules\videoio\src\videoio_registry.cpp (187) cv::`anonymous-namespace'::VideoBackendRegistry::VideoBackendRegistry VIDEOIO: Enabled backends(7, sorted by priority): FFMPEG(1000); GSTREAMER(990); INTEL_MFX(980); MSMF(970); DSHOW(960); CV_IMAGES(950); CV_MJPEG(940)
