In [5]:
# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Use NVIDIA TensorRT to Accelerate Text to Video Inference


**Author**: NVIDIA China SA team
**Disclaim**: This tutorial only demonstrates TesorRT usage, and doesn't meant to provide any absolute performance value. Please test on your own enviorment. 

Text to Video is a popluar application since 2023, and research and industry communities have released many Text to Video models. 

In this tutorial, we use Alibaba DAMO's [text-to-video-synthesis](https://huggingface.co/ali-vilab/modelscope-damo-text-to-video-synthesis) model to demonstrate our recommended workflow to accelerate Text to Video Inference using NVIDIA TensorRT, and performance gain. 

We observed **2.2X inference speedup** on TensorRT compared native PyTorch, using the same GPU.

Additional notes: 

* Some code came from  https://github.com/modelscope/modelscope/tree/v1.11.1/modelscope/models/multi_modal/video_synthesis, with small modifications


## Before Start

Before start, please prepare 

- 1 * NVIDIA GPU, at least 48GB GPU RAM; Ampere, Hopper or Ada 
- Driver Version: 535.104.12   CUDA Version: 12.2
- Docker and [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
- Access to NVIDIA NGC https://ngc.nvidia.com/ (we will use docker image from NGC)

## Build Docker image

Before using this notebook, you need to build docker image and start a container

build docker image:

```bash
cd docker
docker build -t nvcr.io/nvidia/tensorrt:22.11-py3-txt2video .
```


start a container:

```bash
docker run --gpus '"device=0"' -it --shm-size=8G --rm --net=host --ipc=host \
        --ulimit memlock=-1 --ulimit stack=67108864 \
        -v `pwd`:/workspace \
        nvcr.io/nvidia/tensorrt:22.11-py3-txt2video bash
```



## Download Model Files from HuggingFace

You need to download model files from https://huggingface.co/ali-vilab/modelscope-damo-text-to-video-synthesis/tree/main

You can copy your HF token from https://huggingface.co website, and use following script to download models.

In [None]:
from huggingface_hub import snapshot_download

repo = "ali-vilab/modelscope-damo-text-to-video-synthesis"
output_dir = "./models"
HF_TOKEN = <<YOUR_HF_TOKEN>>
snapshot_download(repo_id=repo, local_dir=output_dir, local_dir_use_symlinks=False, token=HF_TOKEN)

## Build TensorRT engine

- **Step 1** Convert models from pytorch to Onnx
- **Step 2** Build TensorRT engine from onnx files

#### Step 1 - Convert models from pytorch to Onnx

we use `torch.onnx.export` to convert model from pytorch to onnx format.

In [3]:
import json
from os import path as osp
from unet_sd import UNetSD
import torch

def convert_to_onnx(model_dir, onnx_dir, device=0):

    config = json.load(open(osp.join(model_dir, "configuration.json")))
    cfg = config["model"]["model_cfg"]
    cfg['temporal_attention'] = True if cfg[
        'temporal_attention'] == 'True' else False

    # Initialize unet
    sd_model = UNetSD(
        in_dim=cfg['unet_in_dim'],
        dim=cfg['unet_dim'],
        y_dim=cfg['unet_y_dim'],
        context_dim=cfg['unet_context_dim'],
        out_dim=cfg['unet_out_dim'],
        dim_mult=cfg['unet_dim_mult'],
        num_heads=cfg['unet_num_heads'],
        head_dim=cfg['unet_head_dim'],
        num_res_blocks=cfg['unet_res_blocks'],
        attn_scales=cfg['unet_attn_scales'],
        dropout=cfg['unet_dropout'],
        temporal_attention=cfg['temporal_attention'])
    sd_model.load_state_dict(
        torch.load(
            osp.join(model_dir, config["model"]["model_args"]["ckpt_unet"])),
        strict=True)
    sd_model.eval()
    sd_model.to(device)


    dummy_x = torch.randn(1, 4, config["model"]["model_args"]["max_frames"],
         config["model"]["model_args"].get("width", 256)//8,
         config["model"]["model_args"].get("width", 256)//8,
         dtype=torch.float32, device=0).cuda()  # noise tensor
    dummy_t = torch.randint(0, 1000, (1,), dtype=torch.int64, device=0).cuda()  # time steps tensor
    dummy_y = torch.randn(1, 77, 1024, dtype=torch.float32, device=0).cuda()  # text embeddings tensor

    # Export the model to ONNX, onnx->trt will failed if add dynamic axes
    onnx_file_path = osp.join(onnx_dir, "model.onnx")
    torch.onnx.export(sd_model,
                    (dummy_x, dummy_t, dummy_y),
                    onnx_file_path,
                    export_params=True,
                    opset_version=17,
                    do_constant_folding=True,
                    input_names=['x', 't', 'y'],
                    output_names=['output'],
                    )

    print("Convert to ONNX finished")

In [4]:
# Convert to ONNX
model_dir = "./models"
onnx_dir = "./models/onnx_test"
convert_to_onnx(model_dir, onnx_dir)

  if mask_last_frame_num > 0:
  if prob == 1:
  elif prob == 0:
  f = torch.tensor(f, dtype=torch.int32).to(0)
  f = torch.tensor(f, dtype=torch.int32).to(0)
  assert x.shape[1] == self.channels
  assert x.shape[1] == self.channels


Convert to ONNX finished


In [6]:
# check onnx model
!ls -hl ./models/onnx

total 5.3G
-rw-r--r-- 1 root root  45K Feb  4 16:03 input_blocks.0.0.weight
-rw-r--r-- 1 root root 640K Feb  4 16:03 input_blocks.0.1.proj_in.weight
-rw-r--r-- 1 root root 640K Feb  4 16:03 input_blocks.0.1.proj_out.weight
-rw-r--r-- 1 root root  16K Feb  4 16:03 input_blocks.0.1.transformer_blocks.0.ff.net.0.proj.bias
-rw-r--r-- 1 root root 1.6M Feb  4 16:03 input_blocks.1.0.emb_layers.1.weight
-rw-r--r-- 1 root root 3.6M Feb  4 16:03 input_blocks.1.0.in_layers.2.weight
-rw-r--r-- 1 root root 3.6M Feb  4 16:03 input_blocks.1.0.out_layers.3.weight
-rw-r--r-- 1 root root 1.2M Feb  4 16:03 input_blocks.1.0.temopral_conv.conv1.2.weight
-rw-r--r-- 1 root root 1.2M Feb  4 16:03 input_blocks.1.0.temopral_conv.conv2.3.weight
-rw-r--r-- 1 root root 1.2M Feb  4 16:03 input_blocks.1.0.temopral_conv.conv3.3.weight
-rw-r--r-- 1 root root 1.2M Feb  4 16:03 input_blocks.1.0.temopral_conv.conv4.3.weight
-rw-r--r-- 1 root root  10K Feb  4 16:03 input_blocks.1.1.transformer_blocks.0.ff.net.

### Convert models from pytorch to Onnx

Convert Models to Onnx



In [8]:
# we have trtexec pre-installed in the docker container
!which trtexec

/opt/tensorrt/bin/trtexec


In [1]:
!trtexec --onnx=./models/onnx/model.onnx \
  --saveEngine=./models/trt_fp16.engine \
  --fp16 --skipInference

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=./models/onnx/model.onnx --saveEngine=./models/trt_fp16.engine --fp16 --skipInference
[02/04/2024-16:20:29] [I] === Model Options ===
[02/04/2024-16:20:29] [I] Format: ONNX
[02/04/2024-16:20:29] [I] Model: ./models/onnx/model.onnx
[02/04/2024-16:20:29] [I] Output:
[02/04/2024-16:20:29] [I] === Build Options ===
[02/04/2024-16:20:29] [I] Max batch: explicit batch
[02/04/2024-16:20:29] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[02/04/2024-16:20:29] [I] minTiming: 1
[02/04/2024-16:20:29] [I] avgTiming: 8
[02/04/2024-16:20:29] [I] Precision: FP32+FP16
[02/04/2024-16:20:29] [I] LayerPrecisions: 
[02/04/2024-16:20:29] [I] Layer Device Types: 
[02/04/2024-16:20:29] [I] Calibration: 
[02/04/2024-16:20:29] [I] Refit: Disabled
[02/04/2024-16:20:29] [I] Version Compatible: Disabled
[02/04/2024-16:20:29] [I] TensorRT runtime: full
[02/04/2024-16:20:29] [I] Lean DLL Path: 
[02

[02/04/2024-16:31:16] [I] Engine built in 646.805 sec.
[02/04/2024-16:31:16] [I] [TRT] Loaded engine size: 2716 MiB
[02/04/2024-16:31:17] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 11672, GPU 7678 (MiB)
[02/04/2024-16:31:17] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 11672, GPU 7686 (MiB)
[02/04/2024-16:31:17] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +2691, now: CPU 0, GPU 2691 (MiB)
[02/04/2024-16:31:17] [I] Engine deserialized in 1.06871 sec.
[02/04/2024-16:31:17] [I] Skipped inference phase since --skipInference is added.
&&&& PASSED TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=./models/onnx/model.onnx --saveEngine=./models/trt_fp16.engine --fp16 --skipInference


In [2]:
# check tensorrt engine file
!ls -lh ./models/trt_fp16.engine

-rw-r--r-- 1 root root 2.7G Feb  4 16:31 ./models/trt_fp16.engine


## Compare Inference Speedup

In [3]:
device = 0
torch_model_dir = "./models"
trt_model_dir = "./models/trt_fp16.engine"

In [4]:
import torch
import time
import tensorrt as trt
import cupy as cp
import numpy as np
import open_clip
import json
from os import path as osp
from unet_sd import UNetSD
from cupyx.profiler import benchmark

trt.init_libnvinfer_plugins(None, "")

config = json.load(open(osp.join(torch_model_dir, "configuration.json")))

cfg = config["model"]["model_cfg"]
cfg['temporal_attention'] = True if cfg['temporal_attention'] == 'True' else False

# Initialize unet
sd_model = UNetSD(
    in_dim=cfg['unet_in_dim'],
    dim=cfg['unet_dim'],
    y_dim=cfg['unet_y_dim'],
    context_dim=cfg['unet_context_dim'],
    out_dim=cfg['unet_out_dim'],
    dim_mult=cfg['unet_dim_mult'],
    num_heads=cfg['unet_num_heads'],
    head_dim=cfg['unet_head_dim'],
    num_res_blocks=cfg['unet_res_blocks'],
    attn_scales=cfg['unet_attn_scales'],
    dropout=cfg['unet_dropout'],
    temporal_attention=cfg['temporal_attention'])
sd_model.load_state_dict(
    torch.load(
        osp.join(torch_model_dir, config["model"]["model_args"]["ckpt_unet"])),
    strict=True)
sd_model.eval()
# sd_model.half()
sd_model.to(device)


from EngineUtil import Engine


  from .autonotebook import tqdm as notebook_tqdm


In [5]:

n_warmup = 10
n_repeat = 10

# create dummy inputs
dummy_x = torch.randn(1, 4, config["model"]["model_args"]["max_frames"],
     config["model"]["model_args"].get("width", 256)//8,
     config["model"]["model_args"].get("width", 256)//8,
     dtype=torch.float32, device=0).cuda()   # noise tensor
dummy_t = torch.randint(0, 1000, (1,), dtype=torch.int32, device=0).cuda()  # time steps tensor
dummy_y = torch.randn(1, 77, 1024, device=0).cuda()  # text embeddings tensor


def pytorch_model():
    return sd_model(dummy_x, dummy_t, dummy_y)

engine = Engine(trt_model_dir)


Loading TensorRT engine: ./models/trt_fp16.engine
[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored
[I] Loading bytes from ./models/trt_fp16.engine


##### Benchmark

In [6]:
pytorch_benchmark = benchmark(pytorch_model, (), n_warmup=n_warmup, n_repeat=n_repeat)

trt_benchmark = benchmark(engine.infer,
    ({"x":dummy_x, "t":dummy_t, "y":dummy_y}, ),
    n_warmup=n_warmup, n_repeat=n_repeat
)

# see https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler._time._PerfCaseResult.html#cupyx.profiler._time._PerfCaseResult
print(pytorch_benchmark)
print(trt_benchmark)

pytorch_model       :    CPU: 119984.063 us   +/- 190.493 (min: 119600.165 / max: 120252.957) us     GPU-0: 211981.627 us   +/- 497.686 (min: 211470.337 / max: 212790.268) us
infer               :    CPU: 57663.214 us   +/- 198.205 (min: 57274.309 / max: 57940.356) us     GPU-0: 90644.787 us   +/- 235.943 (min: 90256.386 / max: 91005.951) us


In [9]:
pytorch_time = 120 + 212
trt_time = 58 + 91
speedup = pytorch_time/trt_time
print(f"pytorch inference {pytorch_time} ms, trt inference used {trt_time} ms; Speedup: {speedup}X")

pytorch inference 332 ms, trt inference used 149 ms; Speedup: 2.228187919463087X
