In [None]:
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet

# Build a pretrained AlexNet, switch it to inference mode and move it to the GPU.
model = alexnet(pretrained=True)
model = model.eval().cuda()

# All-ones example batch on the GPU; torch2trt runs the model on it during conversion.
x = torch.ones((1, 3, 224, 224)).cuda()

# Convert the PyTorch module to its TensorRT counterpart using the example input.
model_trt = torch2trt(model, [x])

In [None]:
# Run the same input through the PyTorch model and the TensorRT model.
y = model(x)
y_trt = model_trt(x)

# Largest absolute element-wise difference between the two outputs.
print((y - y_trt).abs().max())

In [None]:
import torch
from torch2trt import *
import numpy as np
from torchvision import transforms
from torch.utils.data import DataLoader
import os

from layers import disp_to_depth
from utils import readlines
import datasets
import networks
import time
from thop import profile, clever_format

In [None]:
# Locate the checkpoint folder for the selected backbone.
model_name = "resnet18"
load_weights_folder = os.path.join("/work", "garin0115", "models", model_name+"_256x832", "models", "weights_19")

# Load the saved state dicts for the encoder and the depth decoder.
encoder_path = os.path.join(load_weights_folder, "encoder.pth")
decoder_path = os.path.join(load_weights_folder, "depth.pth")
encoder_pth = torch.load(encoder_path)
decoder_pth = torch.load(decoder_path)

# Instantiate the networks that the checkpoints will be loaded into.
encoder = networks.ResnetEncoder(18, False)
decoder = networks.DepthDecoder(encoder.num_ch_enc)

# The encoder checkpoint may hold entries that are not model parameters, so keep
# only keys the encoder knows about. state_dict() is built once here instead of
# once per checkpoint key inside the comprehension.
encoder_state = encoder.state_dict()
encoder.load_state_dict({k: v for k, v in encoder_pth.items() if k in encoder_state})
decoder.load_state_dict(decoder_pth)

In [None]:
# Put the encoder in inference mode before conversion. A freshly constructed
# nn.Module is in training mode, and train-only layer behaviour (presumably the
# encoder's batch-norm layers - TODO confirm) would make both the TensorRT trace
# and the later PyTorch reference run differ from real inference.
encoder.eval()
encoder.cuda()
# decoder.eval()
# decoder.cuda()


# create example data
x = torch.ones((1, 3, 256, 832)).cuda()


# Example inputs for the decoder conversion, which is currently commented out below.
x0 = torch.ones((1, 64, 128, 416)).cuda()
x1 = torch.ones((1, 64, 64, 208)).cuda()
x2 = torch.ones((1, 128, 32, 104)).cuda()
x3 = torch.ones((1, 256, 16, 52)).cuda()
x4 = torch.ones((1, 512, 8, 26)).cuda()

# convert to TensorRT feeding sample data as input
encoder_trt = torch2trt(encoder, [x])
encoder_trt.eval()
# decoder_trt = torch2trt(decoder, [x0, x1, x2, x3, x4])

In [None]:
# CUDA work is launched asynchronously, so the device is synchronized before the
# clock starts and again before it stops. Without this the timings measure
# mostly launch overhead rather than the actual forward pass.
torch.cuda.synchronize()
t2 = time.time()
y_trt = encoder_trt(x)
torch.cuda.synchronize()
FPS_trt = 1/(time.time()-t2)

torch.cuda.synchronize()
t1 = time.time()
y = encoder(x)
torch.cuda.synchronize()
FPS = 1/(time.time()-t1)


In [None]:
# For each pair of PyTorch / TensorRT outputs, print the largest absolute
# element-wise difference.
for yi, yi_trt in zip(y, y_trt):
    print((yi - yi_trt).abs().max())

In [None]:
# Show the single-call rate estimates: PyTorch encoder first, TensorRT encoder second.
print(FPS, FPS_trt)

In [None]:
import tensorrt as trt

In [None]:
# Display where the imported tensorrt package was loaded from.
trt.__path__

# Test ONNX to TensorRT conversion

In [3]:
import tensorrt as trt

# Module-level logger, created with the WARNING severity constant, and the
# runtime built on top of it. build_engine uses the logger; load_engine is
# passed the runtime to deserialize saved engines.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

def build_engine(onnx_path, shape = [1,3,256,832]):

    """
    Build a TensorRT engine from an ONNX file.

    The file is read once and parsed once. The parse result is printed; when
    parsing fails, every parser error is printed and no engine is built.

    Args:
      onnx_path : Path to onnx_file. 
      shape : Shape of the input of the ONNX file. 

    Returns:
      The value returned by builder.build_cuda_engine(network), or None when
      the ONNX file could not be parsed.
    """
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = (1 << 30)
        # Read the serialized model a single time; a second read() on the same
        # handle would return empty bytes.
        with open(onnx_path, 'rb') as model:
            model_bytes = model.read()
        res = parser.parse(model_bytes)
        print(res)
        if not res:
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            # Do not try to build an engine from a network that failed to parse.
            return None
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    """
    Serialize an engine and write the resulting bytes to a file.

    Args:
      engine : Object exposing serialize(), e.g. the result of build_engine.
      file_name : Destination path; opened in binary write mode.

    Raises:
      ValueError : If engine is None (for example after a failed build), so
        the failure is reported clearly and no file is touched.
    """
    if engine is None:
        raise ValueError("Cannot save engine: engine is None (did the build fail?)")
    # Serialize before opening the file so a serialization error never leaves
    # an empty or truncated plan file behind.
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)
        
def load_engine(trt_runtime, plan_path):
    """
    Read a serialized engine from disk and deserialize it.

    Args:
      trt_runtime : Object exposing deserialize_cuda_engine(bytes).
      plan_path : Path of the plan file to read in binary mode.

    Returns:
      Whatever trt_runtime.deserialize_cuda_engine returns for the file's bytes.
    """
    with open(plan_path, 'rb') as plan_file:
        serialized_plan = plan_file.read()
    return trt_runtime.deserialize_cuda_engine(serialized_plan)

In [4]:
import argparse
from onnx import ModelProto
import tensorrt as trt
import os

# Name of the plan file to write and location of the exported ONNX model.
engine_name = "resnet18_skyLoss.plan"
onnx_path = os.path.join("/work", "garin0115", "models", "resnet18_skyLoss_256x832", "models", "weights_19", "resnet18_skyLoss.onnx")
batch_size = 1 

# Parse the ONNX protobuf so the declared input dimensions can be read from it.
model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

# Dimensions 1..3 of the first graph input are combined with the chosen batch size.
input_dims = model.graph.input[0].type.tensor_type.shape.dim
d0, d1, d2 = (input_dims[axis].dim_value for axis in (1, 2, 3))
shape = [batch_size, d0, d1, d2]

# Build the engine for that input shape and write it to disk.
engine = build_engine(onnx_path, shape=shape)
save_engine(engine, engine_name)

True


In [None]:
# Read the plan file named by engine_name back in and deserialize it with the shared runtime.
engine_load = load_engine(trt_runtime, engine_name)

# Test torch2trt module converters (accuracy and speed vs. PyTorch)

In [None]:
from torch2trt import *
from module_test import ModuleTest, MODULE_TESTS
import time
import argparse
import re
import runpy
from termcolor import colored

import torch
import torchvision



def _time_calls(fn, inputs, iterations=50, sync_each_call=False):
    """Return the elapsed seconds for `iterations` calls of fn(*inputs).

    The current CUDA stream is synchronized before the clock starts. With
    sync_each_call=False it is synchronized once after the loop (throughput
    style); with sync_each_call=True it is synchronized after every call
    (latency style).
    """
    torch.cuda.current_stream().synchronize()
    start = time.perf_counter()
    for _ in range(iterations):
        fn(*inputs)
        if sync_each_call:
            torch.cuda.current_stream().synchronize()
    if not sync_each_call:
        torch.cuda.current_stream().synchronize()
    return time.perf_counter() - start


def run(self):
    """Convert a test module with torch2trt, then compare and benchmark it.

    `self` must provide module_fn, device, dtype, input_shapes and
    torch2trt_kwargs. The module is built, moved to the device/dtype, set to
    eval mode and converted using all-zero example inputs. Both versions are
    then evaluated on the same random inputs.

    Returns:
        (max_error, fps, fps_trt, ms, ms_trt): the largest absolute output
        difference, calls per second for PyTorch and TensorRT, and the mean
        per-call time in milliseconds for PyTorch and TensorRT.
    """
    iterations = 50

    # create module
    module = self.module_fn()
    module = module.to(self.device)
    module = module.type(self.dtype)
    module = module.eval()

    # create inputs for conversion
    inputs_conversion = tuple(
        torch.zeros(shape).to(self.device).type(self.dtype)
        for shape in self.input_shapes
    )

    # convert module
    module_trt = torch2trt(module, inputs_conversion, **self.torch2trt_kwargs)

    # create inputs for torch/trt.. copy of inputs to handle inplace ops
    inputs = tuple(
        torch.randn(shape).to(self.device).type(self.dtype)
        for shape in self.input_shapes
    )
    inputs_trt = tuple(tensor.clone() for tensor in inputs)

    # test output against original
    outputs = module(*inputs)
    outputs_trt = module_trt(*inputs_trt)

    # Normalise BOTH results to sequences. If only `outputs` were wrapped, then
    # indexing a bare-tensor `outputs_trt` would select along its first
    # dimension instead of selecting an output, and the comparison below would
    # be against the wrong data.
    if not isinstance(outputs, (tuple, list)):
        outputs = (outputs, )
    if not isinstance(outputs_trt, (tuple, list)):
        outputs_trt = (outputs_trt, )

    # compute max error
    max_error = 0
    for i in range(len(outputs)):
        max_error_i = torch.max(torch.abs(outputs[i] - outputs_trt[i]))
        if max_error_i > max_error:
            max_error = max_error_i

    # benchmark throughput (one synchronization after all calls)
    fps = float(iterations) / _time_calls(module, inputs, iterations)
    fps_trt = float(iterations) / _time_calls(module_trt, inputs, iterations)

    # benchmark latency (synchronize after every call)
    ms = 1000.0 * _time_calls(module, inputs, iterations, sync_each_call=True) / iterations
    ms_trt = 1000.0 * _time_calls(module_trt, inputs, iterations, sync_each_call=True) / iterations

    return max_error, fps, fps_trt, ms, ms_trt
        
        
    
# Extra test modules to execute before the run; the list is currently empty.
for include in []:
    runpy.run_module(include)

# Run every registered module test whose name mentions 'interpolate' and print
# one table row per test.
for test in MODULE_TESTS:
    name = test.module_name()
    if re.search('interpolate', name) is None:
        continue

    max_error, fps, fps_trt, ms, ms_trt = run(test)

    # Short dtype label: the part of the dtype's repr after the last dot.
    dtype_name = repr(test.dtype).split('.')[-1]
    row_values = (
        name,
        dtype_name,
        str(test.input_shapes),
        str(test.torch2trt_kwargs),
        max_error,
        fps,
        fps_trt,
        ms,
        ms_trt,
    )
    line = '| %s | %s | %s | %s | %.2E | %.3g | %.3g | %.3g | %.3g |' % row_values

    print(line)

