In [1]:
import os
import sys
import tempfile

# Make the repository root importable so `utils` resolves from this notebook.
sys.path.append('..')

import numpy as np
import onnxruntime as ort
from tqdm import trange
from IPython.display import Video
from utils.video import write_video, transpose_and_clip

In [2]:
# load model session: create the ONNX Runtime inference session for the decoder
options = ort.SessionOptions()
provider = 'CUDAExecutionProvider'
# Providers are listed in order of precedence; CPU is listed after CUDA as an
# explicit fallback for machines where the CUDA provider is not available.
session = ort.InferenceSession(
  '../models/decoder.onnx',
  sess_options=options,
  providers=[provider, 'CPUExecutionProvider'],
)

In [3]:
# print shapes: map every model input / output name to its declared shape
input_shapes = dict((node.name, node.shape) for node in session.get_inputs())
output_shapes = dict((node.name, node.shape) for node in session.get_outputs())
print('input shapes : ', input_shapes)
print('output shapes: ', output_shapes)

input shapes :  {'encoding_indices': ['b', 8, 16]}
output shapes:  {'big_decoded_img': ['b', 3, 128, 256]}


In [4]:
# load tokens: read the example token array from disk and cast it to int64
tokens_path = "../examples/tokens.npy"
tokens = np.load(tokens_path).astype(np.int64)

In [5]:
# decoding loop: run the decoder on one frame of tokens at a time and collect
# the decoded images in order
# The output names do not change between iterations, so look them up once.
output_names = [o.name for o in session.get_outputs()]
decoded_video = []
for i in trange(len(tokens)):
  # reshape to (1, 8, 16): a batch of one frame for the 'encoding_indices' input
  feed = {'encoding_indices': tokens[i].reshape(1,8,16)}
  # pair each returned array with its output name so it can be selected by key
  outputs = dict(zip(output_names, session.run(None, feed)))
  decoded_video.append(outputs['big_decoded_img'])

100%|██████████| 1200/1200 [00:14<00:00, 80.73it/s]


In [6]:
# transpose and format video
# This cell rebinds `decoded_video` to the transformed result, so re-running it
# would apply transpose_and_clip a second time. Only transform while the name
# still refers to the raw list of decoded frames.
# NOTE(review): assumes transpose_and_clip does not return a list — confirm.
if isinstance(decoded_video, list):
  decoded_video = transpose_and_clip(decoded_video)

In [7]:
# save video
# Build the destination inside the platform's temporary directory instead of
# hard-coding '/tmp', which is not present on every operating system.
save_dst = os.path.join(tempfile.gettempdir(), 'decoded.mp4')
write_video(decoded_video, save_dst, fps=20)
# Kept as the last expression of the cell so the notebook renders the video.
Video(save_dst, embed=True, width=700)