In [1]:
#importing libraries for testing purposes

#Using code from https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/

! pip install timm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [20]:
import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

# Monkeypatch: replace torch.hub's private `_validate_not_a_forked_repo` with a
# stub that accepts three positional arguments and always returns True.
# NOTE(review): presumably a workaround for torch.hub repo-validation failures
# taken from the referenced blog post; no torch.hub call is visible in this
# notebook, so confirm it is still needed. It relies on a private API that may
# change between torch releases.
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True

In [26]:
# Leftover from an earlier step (kept for reference); `trt_model` is not defined
# anywhere in this notebook, so the line cannot be re-run as is:
#traced_mod_trt = torch.jit.trace(trt_model, torch.randn((1,3,224,224)).to("cuda"))

# Load the serialized module from the current working directory.
# NOTE(review): the file name suggests a traced, Torch-TensorRT-compiled
# EfficientNet-B0; presumably `torch_tensorrt` must be imported for the load to
# succeed -- confirm. No map_location is given, so tensors are restored to the
# devices they were saved from.
efficientnet_b0_traced_trt4 = torch.jit.load("efficientnet_b0_traced_trt4.pth")

In [27]:
# Turn on cuDNN benchmark mode (global, process-wide flag), which lets cuDNN
# try several convolution algorithms and keep the fastest one.
cudnn.benchmark = True

def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000, device="cuda"):
    """Time repeated forward passes of ``model`` on random input and print throughput.

    A single random tensor of ``input_shape`` is created once and reused for
    every call. ``nwarmup`` untimed calls are made first, then ``nruns`` timed
    calls; a running average is printed every 10th timed iteration and the
    average throughput at the end. Nothing is returned.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_shape: Shape of the random input; ``input_shape[0]`` is used as
            the number of images per call when computing throughput. Note that
            the default shape is large (1024 x 3 x 512 x 512 values).
        dtype: ``'fp16'`` converts the input to half precision; any other
            value leaves it as created by ``torch.randn``.
        nwarmup: Number of untimed warm-up calls.
        nruns: Number of timed calls. With ``nruns <= 0`` no throughput is
            reported.
        device: Device the input tensor is moved to. Defaults to ``"cuda"``,
            which matches the previous hard-coded behaviour.
    """
    input_data = torch.randn(input_shape).to(device)
    if dtype == 'fp16':
        input_data = input_data.half()

    def _sync():
        # CUDA kernels are launched asynchronously, so wait for them to finish
        # before reading the clock. On non-CUDA devices there is nothing to
        # wait for (and torch.cuda.synchronize() would fail without a GPU).
        if input_data.is_cuda:
            torch.cuda.synchronize(input_data.device)

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            model(input_data)
    _sync()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns+1):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump with wall-clock adjustments and is coarser on some platforms.
            start_time = time.perf_counter()
            model(input_data)
            _sync()
            timings.append(time.perf_counter() - start_time)
            if i%10==0:
                print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print("Input shape:", input_data.size())
    if not timings:
        # Avoid np.mean([]) -> NaN (with a RuntimeWarning) when nothing was timed.
        print("No timed runs were requested (nruns=%d); throughput not computed" % nruns)
        return
    print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings)))

In [29]:
# Benchmark the loaded module with a half-precision input of batch size 1
# (1 x 3 x 224 x 224): the default 50 warm-up calls followed by 100 timed calls.
benchmark(efficientnet_b0_traced_trt4, dtype='fp16', input_shape=(1, 3, 224, 224), nruns=100)

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 2.37 ms
Iteration 20/100, avg batch time 2.37 ms
Iteration 30/100, avg batch time 2.37 ms
Iteration 40/100, avg batch time 2.37 ms
Iteration 50/100, avg batch time 2.37 ms
Iteration 60/100, avg batch time 2.37 ms
Iteration 70/100, avg batch time 2.26 ms
Iteration 80/100, avg batch time 2.14 ms
Iteration 90/100, avg batch time 2.05 ms
Iteration 100/100, avg batch time 1.98 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 505.08 images/second
