<a href="https://colab.research.google.com/github/crazyCoderLi/benchmark_problem/blob/main/benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preparations
Most of these preparations are directly from the official turorials with only a few changes, which mainly include:
*   import packages
*   download the sample video









In [1]:
try:
    import torch
except ModuleNotFoundError:
    !pip install torch torchvision
    import os
    import sys
    import torch
    
if torch.__version__=='1.6.0+cu101' and sys.platform.startswith('linux'):
    !pip install pytorchvideo
else:
    need_pytorchvideo=False
    try:
        # Running notebook locally
        import pytorchvideo
    except ModuleNotFoundError:
        need_pytorchvideo=True
    if need_pytorchvideo:
        # Install from GitHub
        !pip install "git+https://github.com/facebookresearch/pytorchvideo.git"

Collecting git+https://github.com/facebookresearch/pytorchvideo.git
  Cloning https://github.com/facebookresearch/pytorchvideo.git to /tmp/pip-req-build-loynhet6
  Running command git clone -q https://github.com/facebookresearch/pytorchvideo.git /tmp/pip-req-build-loynhet6
Collecting fvcore
  Downloading fvcore-0.1.5.post20210924.tar.gz (49 kB)
[K     |████████████████████████████████| 49 kB 2.7 MB/s 
[?25hCollecting av
  Downloading av-8.0.3-cp37-cp37m-manylinux2010_x86_64.whl (37.2 MB)
[K     |████████████████████████████████| 37.2 MB 32 kB/s 
[?25hCollecting parameterized
  Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Collecting iopath
  Downloading iopath-0.1.9-py3-none-any.whl (27 kB)
Collecting yacs>=0.1.6
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 49.0 MB/s 
Collecting portalocker
  Downloading portalocker-2.3.2-

In [2]:
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 
from typing import Dict
import time
import copy
import torch
import numpy as np

  "The _functional_video module is deprecated. Please use the functional module instead."
  "The _transforms_video module is deprecated. Please use the transforms module instead."


In [3]:
# Download the example video file
!wget https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4 

--2021-10-12 02:43:20--  https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 549197 (536K) [video/mp4]
Saving to: ‘archery.mp4’


2021-10-12 02:43:20 (1.57 MB/s) - ‘archery.mp4’ saved [549197/549197]



# 2. Video Preprocessing Class
Function: transform the video into a list of tensors that the model can handle.

By changing the initialized parameters, we can easily transform the video into different tensors for different models.

But in fact, I didn't fully understand the meaning of each parameter, so I just encapsulated them into the simple class and assigned the params default value as in the official sample code.

In [4]:
class PackPathway(torch.nn.Module):
    def __init__(self, alpha = 4):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // self.alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list


class Preprocess():
    ####################
    # SlowFast transform
    ####################
    def __init__(
            self,
            video_path,
            device = "cuda",
            side_size = 256,
            mean=[0.45, 0.45, 0.45],
            std = [0.225, 0.225, 0.225],
            crop_size = 256,
            num_frames = 32,
            sampling_rate = 2,
            frames_per_second = 30,

    ):
        self.video_path = video_path
        self.device = device
        self.side_size = side_size
        self.mean = mean
        self.std = std
        self.crop_size = crop_size
        self.num_frames = num_frames
        self.sampling_rate = sampling_rate
        self.frames_per_second = frames_per_second

        self.clip_duration = self.get_clip_duration()


    def transform(self):
        trans = ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(self.num_frames),
                    Lambda(lambda x: x / 255.0),
                    NormalizeVideo(self.mean, self.std),
                    ShortSideScale(
                        size=self.side_size
                    ),
                    CenterCropVideo(self.crop_size),
                    PackPathway()
                ]
            ),
        )
        return trans

    def get_clip_duration(self):
        return (self.num_frames * self.sampling_rate) / self.frames_per_second

    def get_processed_data(self):
        start_sec = 0
        end_sec = start_sec + self.clip_duration

        # Initialize an EncodedVideo helper class
        video = EncodedVideo.from_path(self.video_path)

        # Load the desired clip
        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

        # Apply a transform to normalize the video input
        video_data = self.transform()(video_data)

        # Move the inputs to the desired device
        inputs = video_data["video"]
        inputs = [i.to(self.device)[None, ...] for i in inputs]

        return inputs

# 3. Benchmarking Script Class
Function: load a model, run and evaluate its inference performance

By running the same sample data repeatly, we can record its single and total running time, which can help us get statistical results of latencies and throughput.

We can easily load different pre-trained model form torchhub by assigning different model names when initializing the class.


In [5]:
class BenchmarkingScript():
    '''
    A class for running the model inference with metrics testing. User can
    call the the method to run and test the model and print the tested
    latency and throughput.
    '''
    def __init__(
            self,
            video_path,
            preprocess_class,
            running_times,
            device,
            model_source,
            model_name,
            pretrained: bool = True
    ):
        '''
        :param video_path: the path of the video which you want to infer repeatly.
        :param preprocess_class: the class name that you want to preprocess the data.
        :param running_times: times you want to run.
        :param device: device to run the model.
        :param model_source: the models are loaded from torchhub, you should give the source and name of the model.
        :param model_name: the name of the model you want to load.
        :param pretrained: default is True.
        '''
        self.video_path = video_path
        self.preprocess_class = preprocess_class
        self.running_times = running_times
        self.device = device

        self.model_source = model_source
        self.model_name = model_name
        self.pretrained = pretrained

        self.throughput = 0
        self.latency_list = []

        self.processed_data = self.preprocess()
        self.model = self.load_model()

    def preprocess(self):
        '''
        For different models, we need different preprocessing classes,
        we can just pass the class name of preprocessing class to the init argument "preprocess_class".
        :param preprocess_class: you can pass different preprocess class for different goals.
        :return: the processed data
        '''
        pre = self.preprocess_class(self.video_path)
        return pre.get_processed_data()


    def load_model(self):
        '''
        The model will load from the torchhub, you can just pass the source and name of the model,
        and it will be loaded automaticly.
        :return: the model loaded from torchhub
        '''
        tempmodel = torch.hub.load(self.model_source, model=self.model_name, pretrained=self.pretrained)
        return tempmodel

    def run_script(self):
        '''
        The main function for running the model with metrics testing.
        :return: Nothing
        '''

        # move the model to desired device and turn to eval mode.
        self.model = self.model.to(self.device)
        self.model = self.model.eval()

        #warmup, discard the first few running data
        print("Start warming up!")
        for i in range(10):
            temp_data = copy.deepcopy(self.processed_data)
            self.model(temp_data)
            print(f'\tWarming up for {i+1} times')
        print("Warm up is over!")

        # start to infer!
        ful_start_time = time.time()
        for i in range(self.running_times):
            one_start_time = time.time()
            temp_data = copy.deepcopy(self.processed_data)
            self.model(temp_data)
            one_end_time = time.time()
            one_time = one_end_time - one_start_time
            self.latency_list.append(one_time)
            print(f'times:{i} latency:{one_time}')

        ful_end_time = time.time()
        ful_time = ful_end_time - ful_start_time

        self.throughput = self.running_times / ful_time

        p50_latency = np.percentile(self.latency_list, 50)
        p95_latency = np.percentile(self.latency_list, 95)
        p99_latency = np.percentile(self.latency_list, 99)

        print("==================================")
        print(f'Full running time: {ful_time}')
        print(f'throughput: {self.throughput} times/sec')
        print(f'p50_latency: {p50_latency}')
        print(f'p95_latency: {p95_latency}')
        print(f'p99_latency: {p99_latency}')
        print("==================================")


In [7]:
sc = BenchmarkingScript(
    video_path="archery.mp4",
    preprocess_class=Preprocess,
    running_times=100,
    device="cuda",
    model_source="facebookresearch/pytorchvideo:main",
    model_name="slowfast_r50"
)
sc.run_script()

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Start warming up!
	Warming up for 1 times
	Warming up for 2 times
	Warming up for 3 times
	Warming up for 4 times
	Warming up for 5 times
	Warming up for 6 times
	Warming up for 7 times
	Warming up for 8 times
	Warming up for 9 times
	Warming up for 10 times
Warm up is over!
times:0 latency:0.19149398803710938
times:1 latency:0.19211864471435547
times:2 latency:0.19182991981506348
times:3 latency:0.1910254955291748
times:4 latency:0.19181609153747559
times:5 latency:0.19239401817321777
times:6 latency:0.1922438144683838
times:7 latency:0.19068694114685059
times:8 latency:0.19369721412658691
times:9 latency:0.191619873046875
times:10 latency:0.19248247146606445
times:11 latency:0.19165611267089844
times:12 latency:0.19018769264221191
times:13 latency:0.19186782836914062
times:14 latency:0.19196796417236328
times:15 latency:0.19234681129455566
times:16 latency:0.19071650505065918
times:17 latency:0.19209694862365723
times:18 latency:0.1909482479095459
times:19 latency:0.1911945343017578
