# cv.cudacodec

In [11]:
import cv2 as cv
import os
import time
import numpy as np
LINUX = False
import logging
import sys
# strftime pattern for log timestamps: 4-digit year, month, day of month, 24h time.
# `%d` is the day of month; the previous `%y` repeated the year as two digits.
date_strftime_format = "%Y-%m-%d %H:%M:%S"
# Send INFO-and-above records to stdout, prefixed with the timestamp above.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s %(message)s", datefmt=date_strftime_format)

Quick example comparing cv.cudacodec.VideoReader/VideoWriter with cv.VideoCapture/VideoWriter on Windows 11 and Ubuntu using python wheels from https://github.com/cudawarped/opencv-python-cuda-wheels/releases/tag/4.6.0.20221102.

GPU: Mobile RTX 3070 Ti (5th gen decoder & 7th gen encoder)

CPU: i7-12700H

Results vary between Windows and Linux even though Ubuntu runs under WSL and uses the same driver through WDDM.  Hardware-accelerated decoding through `cv.VideoCapture` wasn't available on Linux, and the FFmpeg libs were not built with nvcuvid support.

In [2]:
# Test clips come from https://test-videos.co.uk/jellyfish/mp4-h264
if not LINUX:
    # Windows: decode 4k, then transcode, then encode bench.
    # Paths live under the current user's Videos folder.
    vid_path_in_4k = f"{os.environ['USERPROFILE']}/Videos/LG_New_York_HDR_UHD_4K_Demo.ts"
    vid_path_in_out_4k = f"{os.environ['USERPROFILE']}/Videos/jelly_4k_src.hevc"
    vid_path_out_4k = f"{os.environ['USERPROFILE']}/Videos/jelly_4k.hevc"
    # Disabled for now:
    #vid_path_out_1080p = os.environ['USERPROFILE'] + '/Videos/jelly.h264'
    #vid_path_out_1080p_mp4 = os.environ['USERPROFILE'] + '/Videos/jelly.mp4'
else:
    # Linux: fixed locations under /home/b/media.
    vid_path_in_4k = '/home/b/media/jellyfish-120-mbps-4k-uhd-h264.mkv'
    vid_path_out_4k = '/home/b/media/jelly_4k.h264'
    vid_path_out_1080p = '/home/b/media/jelly.h264'
    vid_path_out_1080p_mp4 = '/home/b/media/jelly.mp4'

In [3]:
print(cv.getBuildInformation())


  Version control:               4.6.0-508-g21133a2091

  Extra modules:
    Location (extra):            D:/repos/opencv/opencv-python/opencv_contrib/modules
    Version control (extra):     4.6.0-106-g9d84eaed

  Platform:
    Timestamp:                   2022-11-07T10:18:54Z
    Host:                        Windows 10.0.22000 AMD64
    CMake:                       3.24.1
    CMake generator:             Ninja
    CMake build tool:            C:/PROGRA~1/MICROS~2/2022/COMMUN~1/Common7/IDE/COMMON~1/MICROS~1/CMake/Ninja/ninja.exe
    MSVC:                        1933
    Configuration:               Release

  CPU/HW features:
    Baseline:                    SSE SSE2 SSE3
      requested:                 SSE3
    Dispatched code generation:  SSE4_1 SSE4_2 FP16 AVX AVX2 AVX512_SKX
      requested:                 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX
      SSE4_1 (16 files):         + SSSE3 SSE4_1
      SSE4_2 (1 files):          + SSSE3 SSE4_1 POPCNT SSE4_2
      FP16 (0 files):    

In [4]:
def cvFormat(color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Return the OpenCV matrix type matching a cudacodec colour format.

    BGRA maps to cv.CV_8UC4, NV12 and GRAY map to cv.CV_8UC1, and BGR maps
    to cv.CV_8UC3. Any other value fails the assertion.
    """
    codec = cv.cudacodec
    single_channel = (codec.COLOR_FORMAT_NV_NV12, codec.COLOR_FORMAT_GRAY)
    supported = (codec.COLOR_FORMAT_BGR, codec.COLOR_FORMAT_BGRA) + single_channel
    assert color_format in supported, f'color_format {color_format} not supported!'
    if color_format == codec.COLOR_FORMAT_BGRA:
        return cv.CV_8UC4
    if color_format in single_channel:
        return cv.CV_8UC1
    return cv.CV_8UC3

## Transcoding Example

Transcode from 1080p h264 to 4k hevc for decoding benchmarks

### `cv::cudacodec::VideoReader` -> `cv::cudacodec::VideoWriter`

In [22]:
def transcode(vid_path_in, vid_path_out, codec = cv.cudacodec.H264, params = cv.cudacodec_VideoReaderInitParams(), 
              color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Decode with cudacodec.VideoReader and re-encode with cudacodec.VideoWriter.

    Args:
        vid_path_in: source path handed to createVideoReader.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.
        params: reader init params; a targetSz other than (0,0) is used as
            the frame size.
        color_format: colour format set on the reader and passed to the writer.

    Returns:
        (frames per second, frames written). The clock starts after the
        reader and writer have been created.
    """
    cuda_stream = cv.cuda.Stream()
    reader = cv.cudacodec.createVideoReader(vid_path_in, params=params)
    reader.set(color_format)
    src_format = reader.format()
    if params.targetSz == (0, 0):
        # No target size given: take the source size, height rounded up to a multiple of 16.
        width = src_format.width
        height = (np.ceil(src_format.height / 16) * 16).astype(int)
    else:
        width, height = params.targetSz

    # NV12 frames get a buffer 1.5x as tall as the picture.
    is_nv12 = color_format == cv.cudacodec.COLOR_FORMAT_NV_NV12
    buffer_rows = int(height * 1.5) if is_nv12 else height
    frame = cv.cuda.GpuMat(buffer_rows, width, cvFormat(color_format))
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            colorFormat=color_format, stream=cuda_stream)
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = reader.nextFrame(frame, cuda_stream)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

#### First convert 1080p(h264) to 4k(hevc) for benchmarking

##### Windows 11

In [19]:
# Upscale the source clip to 3840x2160 HEVC; the result (vid_path_in_out_4k) is the input of the 4k benchmark.
# NOTE(review): vid_path_in_1080p is not assigned in any visible cell - confirm where it is defined.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (3840,2160)
params.minNumDecodeSurfaces = 10 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_1080p,vid_path_in_out_4k,cv.cudacodec.HEVC,params)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(h264) to 4k(hevc) at fps= {fps:.2f}')

Windows 11: Transcoded 300 frames from 1080p(h264) to 4k(hevc) at fps= 98.50


#### Bench

Benchmark 4k transcoding - timings will be slightly optimistic because decoding begins as soon as the VideoWriter is created

##### Windows 11

In [21]:
# Transcode vid_path_in_out_4k to H264 at 1920x1080 using NV12 frames; run twice, the first run is reported as warmup.
# NOTE(review): the printed label says 1080p(hevc) although the input is the 4k HEVC file - confirm the label.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_out_4k,vid_path_out_4k,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11 (warmup): Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = transcode(vid_path_in_out_4k,vid_path_out_4k,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11 (warmup): Transcoded 300 frames from 1080p(hevc) to 1080p(h264) at fps = 269.87
Windows 11: Transcoded 300 frames from 1080p(hevc) to 1080p(h264) at fps = 427.85


##### Ubuntu 20.04 LTS WSL

In [12]:
# Same GPU transcode benchmark on Ubuntu/WSL, single run, NV12 frames at 1920x1080.
# NOTE(review): vid_path_in_out_1080p is only assigned in a later path-configuration cell - confirm run order.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 537.07


### `cv::VideoCapture` -> `cv::cudacodec::VideoWriter`

In [13]:
def transcode_cpu_to_gpu(vid_path_in, vid_path_out, codec = cv.cudacodec.H264):
    """Decode with cv.VideoCapture (no HW acceleration) and encode with cudacodec.VideoWriter.

    Args:
        vid_path_in: source video path.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.

    Returns:
        (frames per second, frames written). The clock starts after the
        capture and writer have been created.
    """
    no_hw_accel = (cv.CAP_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_NONE)
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG, no_hw_accel)
    width = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))
    # Host-side 3-channel buffer handed to every read() call.
    frame = np.zeros((height, width, 3), dtype='uint8')
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            cv.cudacodec.COLOR_FORMAT_BGR)
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

##### Windows 11

In [14]:
# VideoCapture decode -> cudacodec encode; run twice, the first run is reported as warmup.
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Windows 11 (warmup): Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11 (warmup): Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 194.77
Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 173.61


##### Ubuntu 20.04 LTS WSL

In [15]:
# VideoCapture decode -> cudacodec encode on Ubuntu/WSL, single run.
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 147.78


### `cv::VideoCapture` -> `cv::VideoWriter`

Hardware acceleration not available on Linux

In [16]:
def transcode_videocapture(vid_path_in, vid_path_out, hw_decode = False, hw_encode = False):
    """Transcode with cv.VideoCapture (FFmpeg backend) feeding cv.VideoWriter ('avc1').

    Args:
        vid_path_in: source video path.
        vid_path_out: destination video path.
        hw_decode: request VIDEO_ACCELERATION_ANY for the capture, else NONE.
        hw_encode: request VIDEO_ACCELERATION_ANY for the writer, else NONE.

    Returns:
        (frames per second, frames written). The clock starts after the
        capture and writer have been created.
    """
    decode_accel = cv.VIDEO_ACCELERATION_ANY if hw_decode else cv.VIDEO_ACCELERATION_NONE
    encode_accel = cv.VIDEO_ACCELERATION_ANY if hw_encode else cv.VIDEO_ACCELERATION_NONE
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG,
                              (cv.CAP_PROP_HW_ACCELERATION, decode_accel))
    codec_tag = cv.VideoWriter_fourcc(*"avc1")
    # The output keeps the source frame rate and frame size.
    src_fps = capture.get(cv.CAP_PROP_FPS)
    frame_w = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_h = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))
    writer = cv.VideoWriter(vid_path_out, codec_tag, src_fps, (frame_w, frame_h),
                            (cv.VIDEOWRITER_PROP_HW_ACCELERATION, encode_accel))
    frame = np.zeros((frame_h, frame_w, 3), dtype='uint8')
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

##### Windows 11

HW Encoding Only

In [17]:
# hw_decode=False, hw_encode=True: only the writer asks for hardware acceleration.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 174.95


HW Encoding and Decoding - the hw decoder is the bottleneck


In [18]:
# hw_decode=True, hw_encode=True: capture and writer both ask for hardware acceleration.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, True, True)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 103.11


##### Ubuntu 20.04 LTS WSL

In [19]:
# Same call on Ubuntu/WSL with hw_decode=True, hw_encode=True.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, True, True)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 97.68


## Decoding Examples

Using 4k Video to fully stress the decoding unit

### `cv::cudacodec::VideoReader`

In [5]:
def decode(vid_path_in, color_format = cv.cudacodec.COLOR_FORMAT_BGRA, params = cv.cudacodec_VideoReaderInitParams(), n_frames_d = 0):
    """Time GPU decoding of a video with cudacodec.VideoReader.

    Args:
        vid_path_in: source path handed to createVideoReader.
        color_format: cudacodec colour format set on the reader.
        params: reader init params; a targetSz other than (0,0) is used as
            the frame size.
        n_frames_d: stop after counting this many frames; 0 means decode
            until the reader reports no more frames.

    Returns:
        (frames per second, frames counted). The clock starts after the
        reader has been created and stops once the CUDA stream has drained.
    """
    stream = cv.cuda.Stream()
    reader = cv.cudacodec.createVideoReader(vid_path_in,params=params)
    reader.set(color_format)
    # Named src_format so the builtin format() is not shadowed.
    src_format = reader.format()
    if params.targetSz != (0,0):
        w,h = params.targetSz
    else:
        w,h = (src_format.width,src_format.height)
        # Round the height up to a multiple of 16.
        h  = (np.ceil(h/16)*16).astype(int)
    # NV12 frames get a buffer 1.5x as tall as the picture, the same sizing the
    # transcode/encode helpers in this notebook use for their frame buffer.
    h_decode = h if color_format != cv.cudacodec.COLOR_FORMAT_NV_NV12 else int(h*1.5)
    frame = cv.cuda.GpuMat(h_decode,w,cvFormat(color_format))
    n_frames = 0
    start = time.time()
    ret, _ = reader.nextFrame(frame,stream)
    while(ret and ((n_frames < n_frames_d) or not n_frames_d)):
        n_frames += 1
        ret, _ = reader.nextFrame(frame,stream)
    # Wait for queued stream work before stopping the clock.
    stream.waitForCompletion()
    end = time.time()
    return n_frames/(end - start), n_frames

##### Windows 11

In [12]:
# Scratch cell: emits a test warning through the warnings module (no recorded output).
import warnings  
warnings.warn('Warning Message') 



In [13]:
vid_path_in_4k

'C:\\Users\\b/Videos/LG_New_York_HDR_UHD_4K_Demo.ts'

In [24]:
%%capture tmp
# Create a reader for the 4k input while capturing the cell's output into `tmp`.
# NOTE(review): `params` is built here but not passed to createVideoReader - confirm that is intended.
params = cv.cudacodec_VideoReaderInitParams()
#params.udpSource = True

reader = cv.cudacodec.createVideoReader(vid_path_in_4k)

In [25]:
tmp()

In [16]:
reader.nextFrame()

error: OpenCV(4.6.0-dev) D:\repos\opencv\opencv-python\opencv_contrib\modules\cudacodec\src\video_reader.cpp:173: error: (-2:Unspecified error) Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding. in function '`anonymous-namespace'::VideoReaderImpl::internalGrab'


In [28]:
# Decode at most 10 frames of the 4k input as BGR.
# The recorded output for this cell is a "Parsing/Decoding video source failed" error.
params = cv.cudacodec_VideoReaderInitParams()
params.minNumDecodeSurfaces = 10 # maximum decoding performance
fps, n_frames = decode(vid_path_in_4k,cv.cudacodec.COLOR_FORMAT_BGR, params,10)
print(f'Windows 11: Decoded {n_frames} frames from 4k(h264) at fps= {fps:.2f}')

error: OpenCV(4.6.0-dev) D:\repos\opencv\opencv-python\opencv_contrib\modules\cudacodec\src\video_reader.cpp:173: error: (-2:Unspecified error) Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding. in function '`anonymous-namespace'::VideoReaderImpl::internalGrab'


##### Ubuntu 20.04 LTS WSL

In [9]:
# Decode the whole 4k input as BGR on Ubuntu/WSL (no frame limit).
params = cv.cudacodec_VideoReaderInitParams()
params.minNumDecodeSurfaces = 10 # maximum decoding performance
fps, n_frames = decode(vid_path_in_4k,cv.cudacodec.COLOR_FORMAT_BGR, params)
print(f'Ubuntu 20.04 LTS WSL: Decoded {n_frames} frames from 4k(h264) at fps= {fps:.2f}')

Ubuntu 20.04 LTS WSL: Decoded 900 frames from 4k(h264) at fps= 145.98


### `cv::VideoCapture`

In [10]:
def decode_videocapture(vid_path_in, hw_decode = False):
    """Time decoding of a video with cv.VideoCapture (FFmpeg backend).

    Args:
        vid_path_in: source video path.
        hw_decode: request VIDEO_ACCELERATION_ANY for the capture, else NONE.

    Returns:
        (frames per second, frames read). The clock starts after the capture
        has been opened.
    """
    accel = cv.VIDEO_ACCELERATION_ANY if hw_decode else cv.VIDEO_ACCELERATION_NONE
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG, (cv.CAP_PROP_HW_ACCELERATION, accel))
    frame_w = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_h = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))

    # Host-side 3-channel buffer handed to every read() call.
    frame = np.zeros((frame_h, frame_w, 3), dtype='uint8')
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        frame_count += 1
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

##### Windows 11

In [11]:
# VideoCapture decode of the 4k input with hardware acceleration disabled.
fps, n_frames = decode_videocapture(vid_path_in_4k, False)
print(f'Windows 11: Decoded {n_frames} frames from 4k(h264) at fps= {fps:.2f}')

Windows 11: Decoded 900 frames from 4k(h264) at fps= 39.59


HW acceleration is slower through DirectX (D3D11), will be faster with OPENCV_FFMPEG_CAPTURE_OPTIONS=video_codec;h264_cuvid if FFMpeg is compiled with cuvid support

In [12]:
# VideoCapture decode of the 4k input with hardware acceleration requested.
fps, n_frames = decode_videocapture(vid_path_in_4k, True)
print(f'Windows 11 (hw acceleration): Decoded {n_frames} frames from 4k(h264) at fps= {fps:.2f}')

Windows 11 (hw acceleration): Decoded 900 frames from 4k(h264) at fps= 29.28


##### Ubuntu 20.04 LTS WSL

In [13]:
# VideoCapture decode of the 4k input on Ubuntu/WSL with hardware acceleration disabled.
fps, n_frames = decode_videocapture(vid_path_in_4k, False)
print(f'Ubuntu 20.04 LTS WSL: Decoded {n_frames} frames from 4k(h264) at fps= {fps:.2f}')

Ubuntu 20.04 LTS WSL: Decoded 900 frames from 4k(h264) at fps= 70.75


## Encoding Examples

Use 1080p video to allow all frames to be stored in memory before encoding

### `cv::cudacodec::VideoWriter`

In [17]:
def encode(vid_path_in, vid_path_out, codec = cv.cudacodec.H264, params = cv.cudacodec_VideoReaderInitParams(), 
              color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Time cudacodec.VideoWriter on frames that were decoded beforehand.

    Every frame is decoded and cloned into a list first; only the loop that
    writes them out, plus the writer release, is timed.

    Args:
        vid_path_in: source path handed to createVideoReader.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.
        params: reader init params; a targetSz other than (0,0) is used as
            the frame size.
        color_format: colour format set on the reader and passed to the writer.

    Returns:
        (frames per second, frames written).
    """
    cuda_stream = cv.cuda.Stream()
    reader = cv.cudacodec.createVideoReader(vid_path_in, params=params)
    reader.set(color_format)
    src_format = reader.format()
    if params.targetSz == (0, 0):
        # No target size given: take the source size, height rounded up to a multiple of 16.
        width = src_format.width
        height = (np.ceil(src_format.height / 16) * 16).astype(int)
    else:
        width, height = params.targetSz

    # NV12 frames get a buffer 1.5x as tall as the picture.
    is_nv12 = color_format == cv.cudacodec.COLOR_FORMAT_NV_NV12
    buffer_rows = int(height * 1.5) if is_nv12 else height
    frame = cv.cuda.GpuMat(buffer_rows, width, cvFormat(color_format))
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            colorFormat=color_format, stream=cuda_stream)
    # Phase 1 (untimed): decode everything, keeping a copy of each frame.
    decoded = []
    while True:
        grabbed, _ = reader.nextFrame(frame, cuda_stream)
        if not grabbed:
            break
        decoded.append(frame.clone())
    cuda_stream.waitForCompletion()
    # Phase 2 (timed): push the stored frames through the writer.
    t_start = time.time()
    for stored_frame in decoded:
        writer.write(stored_frame)
    writer.release()
    elapsed = time.time() - t_start
    frame_count = len(decoded)
    return frame_count / elapsed, frame_count

##### Windows 11

In [22]:
# Encode benchmark on the 4k input with NV12 frames and no target size; run twice, the first run is reported as warmup.
# encode() clones every decoded frame before writing; the recorded output for this cell is a GPU out-of-memory error.
# NOTE(review): the printed labels say 1080p although the input is vid_path_in_4k - confirm.
params = cv.cudacodec_VideoReaderInitParams()
#params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = encode(vid_path_in_4k,vid_path_out_4k,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11 (warmup): Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = encode(vid_path_in_4k,vid_path_out_4k,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

error: OpenCV(4.6.0-dev) D:\repos\opencv\opencv-python\opencv\modules\core\src\cuda\gpu_mat.cu:116: error: (-217:Gpu API call) out of memory in function '<unnamed>::DefaultAllocator::allocate'


##### Ubuntu 20.04 LTS WSL

In [None]:
# Encode benchmark on Ubuntu/WSL with NV12 frames at 1920x1080 (cell not executed).
# NOTE(review): encode_bench is only defined in a later cell; the helper defined above is named encode - confirm which is meant.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = encode_bench(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Ubuntu 20.04 LTS WSL: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

In [None]:
### `cv::VideoWriter` with hardware acceleration - unable to disable on windows

In [None]:
def encode_bench_videocapture(vid_path_in, vid_path_out, hw_decode = False, hw_encode = False):
    """Time cv.VideoWriter ('avc1') on frames that were decoded beforehand.

    All frames are read with cv.VideoCapture and copied into a list first;
    the writer is created afterwards and only the write loop plus the
    writer release is timed.

    Args:
        vid_path_in: source video path.
        vid_path_out: destination video path.
        hw_decode: request VIDEO_ACCELERATION_ANY for the capture, else NONE.
        hw_encode: request VIDEO_ACCELERATION_ANY for the writer, else NONE.

    Returns:
        (frames per second, frames written).
    """
    decode_accel = cv.VIDEO_ACCELERATION_ANY if hw_decode else cv.VIDEO_ACCELERATION_NONE
    encode_accel = cv.VIDEO_ACCELERATION_ANY if hw_encode else cv.VIDEO_ACCELERATION_NONE
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG,
                              (cv.CAP_PROP_HW_ACCELERATION, decode_accel))
    codec_tag = cv.VideoWriter_fourcc(*"avc1")
    src_fps = capture.get(cv.CAP_PROP_FPS)
    frame_w = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_h = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))

    # Phase 1 (untimed): read everything, keeping a copy of each frame.
    frame = np.zeros((frame_h, frame_w, 3), dtype='uint8')
    decoded = []
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        decoded.append(frame.copy())
    writer = cv.VideoWriter(vid_path_out, codec_tag, src_fps, (frame_w, frame_h),
                            (cv.VIDEOWRITER_PROP_HW_ACCELERATION, encode_accel))
    # Phase 2 (timed): push the stored frames through the writer.
    t_start = time.time()
    for stored_frame in decoded:
        writer.write(stored_frame)
    writer.release()
    elapsed = time.time() - t_start
    frame_count = len(decoded)
    return frame_count / elapsed, frame_count

# Timer sanity check: print the elapsed seconds after a 1 s sleep and again after a further 10 s sleep.
start = time.time()
time.sleep(1)
print(time.time() - start)
time.sleep(10)
print(time.time() - start)

In [None]:
# running decode kills the performance of inbuilt hardware encoder?

In [None]:
# Scratch version of encode_bench_videocapture unrolled at cell level:
# software decode, hardware-accelerated encode requested.
vid_path_in = vid_path_in_out_1080p
vid_path_out = vid_path_out_1080p_mp4
props_decode  = (cv.CAP_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_NONE)
props_encode  = (cv.VIDEOWRITER_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_ANY)
cap = cv.VideoCapture(vid_path_in,cv.CAP_FFMPEG, props_decode)
#fourcc = cv.VideoWriter_fourcc(*"mp4v")
fourcc = cv.VideoWriter_fourcc(*"avc1")
fps = cap.get(cv.CAP_PROP_FPS)
width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
#writer = cv.VideoWriter(vid_path_out, fourcc, fps, (width,height), props_encode)
# Read every frame up front, keeping a copy of each.
frame = np.zeros((height,width,3),dtype='uint8')
n_frames = 0
frames = []
ret, _ = cap.read(frame)
while(ret):
    frames.append(frame.copy())
    ret, _ = cap.read(frame)
            


# The writer is created only after all frames have been read.
writer = cv.VideoWriter(vid_path_out, fourcc, fps, (width,height), props_encode)
#time.sleep(10)
n_frames = 0
# Timed section: write the stored frames and release the writer.
start = time.time()
for frame_to_encode in frames:
    n_frames += 1
    writer.write(frame_to_encode)    
writer.release()
end = time.time()
# (frames per second, frames written)
n_frames/(end - start), n_frames
#return n_frames/(end - start), n_frames;

len(frames)

In [None]:
Decoding is the bottleneck.  Note: running encoding twice reduces performance - memory?

In [None]:
##### Windows 11

In [None]:
# cv.VideoWriter encode benchmark (software decode, hardware encode requested), run twice back to back.
fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

In [None]:
##### Ubuntu 20.04 LTS WSL

In [None]:
# cv.VideoWriter encode benchmark on Ubuntu/WSL (software decode, hardware encode requested).
fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Ubuntu 20.04 LTS WSL:  Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

## Transcoding Example

Transcode from 4k to 1080p to produce the input for the benchmarks below

In [6]:
# Input/output clip locations for the 1080p benchmarks, per platform.
if not LINUX:
    # Windows: everything lives under the current user's Videos folder.
    vid_path_in_4k = f"{os.environ['USERPROFILE']}/Videos/jellyfish-120-mbps-4k-uhd-h264.mkv"
    vid_path_in_out_1080p = f"{os.environ['USERPROFILE']}/Videos/jelly_1080p.hevc"
    vid_path_out_1080p = f"{os.environ['USERPROFILE']}/Videos/jelly.h264"
    vid_path_out_1080p_mp4 = f"{os.environ['USERPROFILE']}/Videos/jelly.mp4"
else:
    # Linux: fixed locations under /home/b/media.
    vid_path_in_4k = '/home/b/media/jellyfish-120-mbps-4k-uhd-h264.mkv'
    vid_path_in_out_1080p = '/home/b/media/jelly_1080p.hevc'
    vid_path_out_1080p = '/home/b/media/jelly.h264'
    vid_path_out_1080p_mp4 = '/home/b/media/jelly.mp4'

In [7]:
print(cv.getBuildInformation())


  Version control:               4.6.0-508-g21133a2091

  Extra modules:
    Location (extra):            D:/repos/opencv/opencv-python/opencv_contrib/modules
    Version control (extra):     4.6.0-106-g9d84eaed

  Platform:
    Timestamp:                   2022-11-07T10:18:54Z
    Host:                        Windows 10.0.22000 AMD64
    CMake:                       3.24.1
    CMake generator:             Ninja
    CMake build tool:            C:/PROGRA~1/MICROS~2/2022/COMMUN~1/Common7/IDE/COMMON~1/MICROS~1/CMake/Ninja/ninja.exe
    MSVC:                        1933
    Configuration:               Release

  CPU/HW features:
    Baseline:                    SSE SSE2 SSE3
      requested:                 SSE3
    Dispatched code generation:  SSE4_1 SSE4_2 FP16 AVX AVX2 AVX512_SKX
      requested:                 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX
      SSE4_1 (16 files):         + SSSE3 SSE4_1
      SSE4_2 (1 files):          + SSSE3 SSE4_1 POPCNT SSE4_2
      FP16 (0 files):    

In [8]:
def cvFormat(color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Return the OpenCV matrix type matching a cudacodec colour format.

    BGRA maps to cv.CV_8UC4, NV12 and GRAY map to cv.CV_8UC1, and BGR maps
    to cv.CV_8UC3. Any other value fails the assertion.
    """
    codec = cv.cudacodec
    single_channel = (codec.COLOR_FORMAT_NV_NV12, codec.COLOR_FORMAT_GRAY)
    supported = (codec.COLOR_FORMAT_BGR, codec.COLOR_FORMAT_BGRA) + single_channel
    assert color_format in supported, f'color_format {color_format} not supported!'
    if color_format == codec.COLOR_FORMAT_BGRA:
        return cv.CV_8UC4
    if color_format in single_channel:
        return cv.CV_8UC1
    return cv.CV_8UC3

### `cv::cudacodec::VideoReader` -> `cv::cudacodec::VideoWriter`

In [9]:
def transcode(vid_path_in, vid_path_out, codec = cv.cudacodec.H264, params = cv.cudacodec_VideoReaderInitParams(), 
              color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Decode with cudacodec.VideoReader and re-encode with cudacodec.VideoWriter.

    Args:
        vid_path_in: source path handed to createVideoReader.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.
        params: reader init params; a targetSz other than (0,0) is used as
            the frame size.
        color_format: colour format set on the reader and passed to the writer.

    Returns:
        (frames per second, frames written). The clock starts after the
        reader and writer have been created.
    """
    cuda_stream = cv.cuda.Stream()
    reader = cv.cudacodec.createVideoReader(vid_path_in, params=params)
    reader.set(color_format)
    src_format = reader.format()
    if params.targetSz == (0, 0):
        # No target size given: take the source size, height rounded up to a multiple of 16.
        width = src_format.width
        height = (np.ceil(src_format.height / 16) * 16).astype(int)
    else:
        width, height = params.targetSz

    # NV12 frames get a buffer 1.5x as tall as the picture.
    is_nv12 = color_format == cv.cudacodec.COLOR_FORMAT_NV_NV12
    buffer_rows = int(height * 1.5) if is_nv12 else height
    frame = cv.cuda.GpuMat(buffer_rows, width, cvFormat(color_format))
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            colorFormat=color_format, stream=cuda_stream)
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = reader.nextFrame(frame, cuda_stream)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

#### First convert 4K(h264) to 1080p(hevc) for benchmarking

##### Windows 11

In [10]:
# Downscale the 4k H264 clip to 1920x1080 HEVC; the result (vid_path_in_out_1080p) feeds the benchmarks below.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 10 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_4k,vid_path_in_out_1080p,cv.cudacodec.HEVC,params)
print(f'Windows 11: Transcoded {n_frames} frames from 4k(h264) to 1080p(hevc) at fps= {fps:.2f}')

Windows 11: Transcoded 900 frames from 4k(h264) to 1080p(hevc) at fps= 145.09


#### Bench

Benchmark 1080p transcoding - timings will be slightly optimistic because decoding begins as soon as the VideoWriter is created

##### Windows 11

In [11]:
# GPU transcode of the 1080p HEVC clip to H264 using NV12 frames; run twice, the first run is reported as warmup.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11 (warmup): Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = transcode(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11 (warmup): Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 538.10
Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 534.52


##### Ubuntu 20.04 LTS WSL

In [12]:
# Same GPU transcode benchmark on Ubuntu/WSL, single run.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = transcode(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 537.07


### `cv::VideoCapture` -> `cv::cudacodec::VideoWriter`

In [13]:
def transcode_cpu_to_gpu(vid_path_in, vid_path_out, codec = cv.cudacodec.H264):
    """Decode with cv.VideoCapture (no HW acceleration) and encode with cudacodec.VideoWriter.

    Args:
        vid_path_in: source video path.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.

    Returns:
        (frames per second, frames written). The clock starts after the
        capture and writer have been created.
    """
    no_hw_accel = (cv.CAP_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_NONE)
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG, no_hw_accel)
    width = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))
    # Host-side 3-channel buffer handed to every read() call.
    frame = np.zeros((height, width, 3), dtype='uint8')
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            cv.cudacodec.COLOR_FORMAT_BGR)
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

##### Windows 11

In [14]:
# VideoCapture decode -> cudacodec encode; run twice, the first run is reported as warmup.
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Windows 11 (warmup): Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11 (warmup): Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 194.77
Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 173.61


##### Ubuntu 20.04 LTS WSL

In [15]:
# VideoCapture decode -> cudacodec encode on Ubuntu/WSL, single run.
fps, n_frames = transcode_cpu_to_gpu(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 147.78


### `cv::VideoCapture` -> `cv::VideoWriter`

Hardware acceleration not available on Linux

In [16]:
def transcode_videocapture(vid_path_in, vid_path_out, hw_decode = False, hw_encode = False):
    """Transcode with cv.VideoCapture (FFmpeg backend) feeding cv.VideoWriter ('avc1').

    Args:
        vid_path_in: source video path.
        vid_path_out: destination video path.
        hw_decode: request VIDEO_ACCELERATION_ANY for the capture, else NONE.
        hw_encode: request VIDEO_ACCELERATION_ANY for the writer, else NONE.

    Returns:
        (frames per second, frames written). The clock starts after the
        capture and writer have been created.
    """
    decode_accel = cv.VIDEO_ACCELERATION_ANY if hw_decode else cv.VIDEO_ACCELERATION_NONE
    encode_accel = cv.VIDEO_ACCELERATION_ANY if hw_encode else cv.VIDEO_ACCELERATION_NONE
    capture = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG,
                              (cv.CAP_PROP_HW_ACCELERATION, decode_accel))
    codec_tag = cv.VideoWriter_fourcc(*"avc1")
    # The output keeps the source frame rate and frame size.
    src_fps = capture.get(cv.CAP_PROP_FPS)
    frame_w = int(capture.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_h = int(capture.get(cv.CAP_PROP_FRAME_HEIGHT))
    writer = cv.VideoWriter(vid_path_out, codec_tag, src_fps, (frame_w, frame_h),
                            (cv.VIDEOWRITER_PROP_HW_ACCELERATION, encode_accel))
    frame = np.zeros((frame_h, frame_w, 3), dtype='uint8')
    frame_count = 0
    t_start = time.time()
    while True:
        grabbed, _ = capture.read(frame)
        if not grabbed:
            break
        frame_count += 1
        writer.write(frame)
    writer.release()
    elapsed = time.time() - t_start
    return frame_count / elapsed, frame_count

##### Windows 11

HW Encoding Only

In [17]:
# hw_decode=False, hw_encode=True: only the writer asks for hardware acceleration.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 174.95


HW Encoding and Decoding - the hw decoder is the bottleneck


In [18]:
# hw_decode=True, hw_encode=True: capture and writer both ask for hardware acceleration.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, True, True)
print(f'Windows 11: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 103.11


##### Ubuntu 20.04 LTS WSL

In [19]:
# Same call on Ubuntu/WSL with hw_decode=True, hw_encode=True.
fps, n_frames = transcode_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, True, True)
print(f'Ubuntu 20.04 LTS WSL: Transcoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Transcoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 97.68


## Encoding Examples

### `cv::cudacodec::VideoWriter`

In [20]:
def encode_bench(vid_path_in, vid_path_out, codec = cv.cudacodec.H264, params = cv.cudacodec_VideoReaderInitParams(), 
              color_format = cv.cudacodec.COLOR_FORMAT_BGRA):
    """Time cudacodec.VideoWriter on frames that were decoded beforehand.

    Every frame is decoded and cloned into a list first; only the loop that
    writes them out, plus the writer release, is timed.

    Args:
        vid_path_in: source path handed to createVideoReader.
        vid_path_out: destination path handed to createVideoWriter.
        codec: cudacodec codec for the output.
        params: reader init params; a targetSz other than (0,0) is used as
            the frame size.
        color_format: colour format set on the reader and passed to the writer.

    Returns:
        (frames per second, frames written).
    """
    cuda_stream = cv.cuda.Stream()
    reader = cv.cudacodec.createVideoReader(vid_path_in, params=params)
    reader.set(color_format)
    src_format = reader.format()
    if params.targetSz == (0, 0):
        # No target size given: take the source size, height rounded up to a multiple of 16.
        width = src_format.width
        height = (np.ceil(src_format.height / 16) * 16).astype(int)
    else:
        width, height = params.targetSz

    # NV12 frames get a buffer 1.5x as tall as the picture.
    is_nv12 = color_format == cv.cudacodec.COLOR_FORMAT_NV_NV12
    buffer_rows = int(height * 1.5) if is_nv12 else height
    frame = cv.cuda.GpuMat(buffer_rows, width, cvFormat(color_format))
    writer = cv.cudacodec.createVideoWriter(vid_path_out, [width, height], codec,
                                            colorFormat=color_format, stream=cuda_stream)
    # Phase 1 (untimed): decode everything, keeping a copy of each frame.
    decoded = []
    while True:
        grabbed, _ = reader.nextFrame(frame, cuda_stream)
        if not grabbed:
            break
        decoded.append(frame.clone())
    cuda_stream.waitForCompletion()
    # Phase 2 (timed): push the stored frames through the writer.
    t_start = time.time()
    for stored_frame in decoded:
        writer.write(stored_frame)
    writer.release()
    elapsed = time.time() - t_start
    frame_count = len(decoded)
    return frame_count / elapsed, frame_count

##### Windows 11

In [21]:
# GPU encode benchmark on the 1080p HEVC clip with NV12 frames; run twice, the first run is reported as warmup.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = encode_bench(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11 (warmup): Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')
fps, n_frames = encode_bench(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11 (warmup): Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 542.75
Windows 11: Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 537.46


##### Ubuntu 20.04 LTS WSL

In [22]:
# Same GPU encode benchmark on Ubuntu/WSL, single run.
params = cv.cudacodec_VideoReaderInitParams()
params.targetSz = (1920,1080)
params.minNumDecodeSurfaces = 30 # maximum decoding performance
fps, n_frames = encode_bench(vid_path_in_out_1080p,vid_path_out_1080p,cv.cudacodec.H264,params,cv.cudacodec.COLOR_FORMAT_NV_NV12)
print(f'Ubuntu 20.04 LTS WSL: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL: Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 538.94


### `cv::VideoWriter` with hardware acceleration - unable to disable on windows

In [23]:
def encode_bench_videocapture(vid_path_in, vid_path_out, hw_decode = False, hw_encode = False):
    """Benchmark `cv.VideoWriter` encoding throughput on pre-decoded frames.

    Every frame of `vid_path_in` is first decoded into host memory with
    `cv.VideoCapture` (FFmpeg backend) so that only the encode step is timed.
    The capture is released as soon as decoding finishes, i.e. before the
    writer is created, so the decoder is no longer open while the encoder is
    being measured and is not leaked when the function returns or raises.

    Args:
        vid_path_in: Path of the video file to decode.
        vid_path_out: Path of the encoded output file (written with the
            "avc1" fourcc at the source's reported FPS and frame size).
        hw_decode: If True request `cv.VIDEO_ACCELERATION_ANY` for the
            capture, otherwise `cv.VIDEO_ACCELERATION_NONE`.
        hw_encode: If True request `cv.VIDEO_ACCELERATION_ANY` for the
            writer, otherwise `cv.VIDEO_ACCELERATION_NONE`.

    Returns:
        Tuple `(fps, n_frames)`: frames written per second, where the timed
        section covers every `writer.write()` call plus `writer.release()`,
        and the number of frames written.
    """
    decode_accel = cv.VIDEO_ACCELERATION_ANY if hw_decode else cv.VIDEO_ACCELERATION_NONE
    encode_accel = cv.VIDEO_ACCELERATION_ANY if hw_encode else cv.VIDEO_ACCELERATION_NONE
    props_decode = (cv.CAP_PROP_HW_ACCELERATION, decode_accel)
    props_encode = (cv.VIDEOWRITER_PROP_HW_ACCELERATION, encode_accel)

    cap = cv.VideoCapture(vid_path_in, cv.CAP_FFMPEG, props_decode)
    try:
        fourcc = cv.VideoWriter_fourcc(*"avc1")
        fps = cap.get(cv.CAP_PROP_FPS)
        width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))

        # Decode into one reusable buffer and keep a private copy of each
        # frame, so the timed loop below does no decoding at all.
        frame = np.zeros((height, width, 3), dtype='uint8')
        frames = []
        ret, _ = cap.read(frame)
        while ret:
            frames.append(frame.copy())
            ret, _ = cap.read(frame)
    finally:
        # Close the decoder before any encoding starts (and on error).
        cap.release()

    writer = cv.VideoWriter(vid_path_out, fourcc, fps, (width, height), props_encode)
    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    for frame_to_encode in frames:
        writer.write(frame_to_encode)
    # release() stays inside the timed section, as in the original measurement.
    writer.release()
    end = time.perf_counter()

    n_frames = len(frames)
    return n_frames/(end - start), n_frames

In [24]:
# Sanity check of time.sleep()/time.time(): print the elapsed wall-clock time
# after a 1 s sleep and again after a further 10 s sleep (about 11 s in total).
start = time.time()
time.sleep(1)
print(time.time() - start)
time.sleep(10)
print(time.time() - start)

1.001640796661377
11.010230779647827


In [25]:
# running decode kills the performance of inbuilt hardware encoder?

In [26]:
# Inline copy of the decode half of encode_bench_videocapture: decode every
# frame of the 1080p source into the global list `frames`, with hardware
# acceleration disabled for the capture. The writer is deliberately NOT
# created here (see the commented-out line); props_encode, fourcc, fps, width
# and height are left in kernel state for the cell that does the encoding.
vid_path_in = vid_path_in_out_1080p
vid_path_out = vid_path_out_1080p_mp4
props_decode  = (cv.CAP_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_NONE)
props_encode  = (cv.VIDEOWRITER_PROP_HW_ACCELERATION, cv.VIDEO_ACCELERATION_ANY)
# NOTE(review): `cap` is never released in this cell, so the capture stays
# open while the next cell encodes — confirm whether that is intended.
cap = cv.VideoCapture(vid_path_in,cv.CAP_FFMPEG, props_decode)
#fourcc = cv.VideoWriter_fourcc(*"mp4v")
fourcc = cv.VideoWriter_fourcc(*"avc1")
fps = cap.get(cv.CAP_PROP_FPS)
width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
#writer = cv.VideoWriter(vid_path_out, fourcc, fps, (width,height), props_encode)
# Reusable decode buffer; each decoded frame is copied before being stored.
frame = np.zeros((height,width,3),dtype='uint8')
n_frames = 0
frames = []
ret, _ = cap.read(frame)
while(ret):
    frames.append(frame.copy())
    ret, _ = cap.read(frame)
            

In [27]:

# Inline copy of the encode half of encode_bench_videocapture: encode the
# frames held in the global `frames` list, using the vid_path_out, fourcc,
# fps, width, height and props_encode values already present in kernel state.
# Only the write loop and writer.release() are timed; creating the writer is not.
writer = cv.VideoWriter(vid_path_out, fourcc, fps, (width,height), props_encode)
#time.sleep(10)
n_frames = 0
start = time.time()
for frame_to_encode in frames:
    n_frames += 1
    writer.write(frame_to_encode)    
writer.release()
end = time.time()
# Displayed as the cell result: (frames per second, number of frames written).
n_frames/(end - start), n_frames
#return n_frames/(end - start), n_frames;

(240.43738011167878, 900)

In [28]:
# Number of decoded frames currently held in the global `frames` list.
len(frames)

900

Decoding is the bottleneck.  Note: running the encoding twice reduces performance - is this memory related?

##### Windows 11

In [29]:
# cv.VideoWriter benchmark: hw_decode=False (no acceleration requested for the
# capture), hw_encode=True (any available acceleration requested for the writer).
fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 260.64


In [32]:
# Repeat of the same cv.VideoWriter benchmark call (hw_decode=False,
# hw_encode=True) with identical arguments, to compare against the first run.
fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Windows 11: Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Windows 11: Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 259.42


##### Ubuntu 20.04 LTS WSL

In [31]:
# Same cv.VideoWriter benchmark call (hw_decode=False, hw_encode=True),
# reported under the Ubuntu WSL label.
fps, n_frames = encode_bench_videocapture(vid_path_in_out_1080p,vid_path_out_1080p_mp4, False, True)
print(f'Ubuntu 20.04 LTS WSL:  Encoded {n_frames} frames from 1080p(hevc) to 1080p(h264) at fps = {fps:.2f}')

Ubuntu 20.04 LTS WSL:  Encoded 900 frames from 1080p(hevc) to 1080p(h264) at fps = 256.27
