In [1]:
#importing libraries for testing purposes

#Using code from https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/

! pip install timm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 39.0 MB/s eta 0:00:01
Installing collected packages: timm
Successfully installed timm-0.5.4


In [2]:
import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

# Replace torch.hub's fork-validation hook with a stub that always returns True.
# NOTE(review): this patches a private torch.hub attribute (leading underscore), so it may
# break across PyTorch versions; no cell visible here calls torch.hub.load — confirm it is still needed.
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True

# Build EfficientNet-B0 through timm with pretrained weights (the cell output shows the checkpoint download).
efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth


In [33]:
#print(efficientnet_b0)

In [3]:
# Put the pretrained network in eval mode on the GPU and smoke-test it on a random batch.
model = efficientnet_b0.eval().to("cuda")
# Inference only: disable autograd so the 128-image forward pass does not keep
# activations around for a backward pass that never happens.
with torch.no_grad():
    detections_batch = model(torch.randn(128, 3, 224, 224).to("cuda"))
# Display the output shape as the cell result.
detections_batch.shape

torch.Size([128, 1000])

In [4]:
# Turn on cuDNN's benchmark mode (see the torch.backends.cudnn documentation for its exact effect).
# Each benchmark() call below reuses one fixed-shape input tensor for every iteration.
cudnn.benchmark = True

def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000, device="cuda"):
    """Time repeated forward passes of ``model`` on random input and print the throughput.

    Args:
        model: Callable that accepts a single tensor (for example an ``nn.Module``
            or a TorchScript module).
        input_shape: Shape of the random input tensor. Its first entry is used as
            the number of images per batch when computing throughput.
        dtype: ``'fp16'`` casts the input to half precision; any other value leaves
            the input in the dtype produced by ``torch.randn``.
        nwarmup: Number of untimed forward passes executed before measuring.
        nruns: Number of timed forward passes. Must be at least 1.
        device: Device the input tensor is moved to. Defaults to ``"cuda"``.

    Returns:
        None. Progress (every 10th iteration) and the final throughput are printed.

    Raises:
        ValueError: If ``nruns`` is smaller than 1, because no average could be computed.
    """
    if nruns < 1:
        raise ValueError("nruns must be >= 1, got %r" % (nruns,))

    on_cuda = torch.device(device).type == "cuda"

    def _sync():
        # Wait for queued GPU work so the measured interval covers the whole forward pass.
        # Skipped on non-CUDA devices, where torch.cuda.synchronize() is not applicable.
        if on_cuda:
            torch.cuda.synchronize()

    input_data = torch.randn(input_shape).to(device)
    if dtype == 'fp16':
        input_data = input_data.half()

    timings = []
    with torch.no_grad():
        print("Warm up ...")
        for _ in range(nwarmup):
            model(input_data)
        _sync()

        print("Start timing ...")
        for i in range(1, nruns + 1):
            # perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
            start_time = time.perf_counter()
            model(input_data)
            _sync()
            timings.append(time.perf_counter() - start_time)
            if i % 10 == 0:
                print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print("Input shape:", input_data.size())
    print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings)))

In [11]:
# Baseline measurement: the un-traced PyTorch module on the GPU, batch of one 224x224 image,
# using benchmark()'s default dtype setting.
model = efficientnet_b0.to("cuda").eval()
benchmark(model, nruns=100, input_shape=(1, 3, 224, 224))

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 7.90 ms
Iteration 20/100, avg batch time 7.88 ms
Iteration 30/100, avg batch time 7.89 ms
Iteration 40/100, avg batch time 7.88 ms
Iteration 50/100, avg batch time 7.87 ms
Iteration 60/100, avg batch time 7.87 ms
Iteration 70/100, avg batch time 7.86 ms
Iteration 80/100, avg batch time 7.87 ms
Iteration 90/100, avg batch time 7.88 ms
Iteration 100/100, avg batch time 7.87 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 127.03 images/second


In [6]:
# Trace the module on a random batch-of-one example to obtain a TorchScript version,
# write it to disk, then time it with the same shape and run count as the eager benchmark.
traced_model = torch.jit.trace(
    model,
    example_inputs=torch.randn((1, 3, 224, 224)).to("cuda"),
)
torch.jit.save(traced_model, f="efficientnet_b0_traced.jit.pt")
benchmark(traced_model, nruns=100, input_shape=(1, 3, 224, 224))

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 4.89 ms
Iteration 20/100, avg batch time 4.89 ms
Iteration 30/100, avg batch time 4.89 ms
Iteration 40/100, avg batch time 4.89 ms
Iteration 50/100, avg batch time 4.89 ms
Iteration 60/100, avg batch time 4.89 ms
Iteration 70/100, avg batch time 4.89 ms
Iteration 80/100, avg batch time 4.89 ms
Iteration 90/100, avg batch time 4.89 ms
Iteration 100/100, avg batch time 4.89 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 204.50 images/second


In [26]:
#print(model)

In [7]:
# Compile the module with Torch-TensorRT for a fixed (1, 3, 224, 224) input declared as half precision.
# NOTE(review): the captured output of this cell warns that the user-specified Float16 input
# setting conflicts with another dtype and may fail at runtime under partial compilation; it
# suggests removing the dtype spec or setting require_full_compilation=True — confirm which is intended.
trt_model = torch_tensorrt.compile(model, 
    inputs= [torch_tensorrt.Input((1, 3, 224, 224),dtype=torch.half)],
    enabled_precisions= {torch.float, torch.half} # Run with FP16 (torch.float is listed as well)
)

The compiler is going to use the user setting Float16
This conflict may cause an error at runtime due to partial compilation being enabled and therefore
compatibility with PyTorch's data type convention is required.
If you do indeed see errors at runtime either:
- Remove the dtype spec for x.1
- Disable partial compilation by setting require_full_compilation to True


In [25]:
# Inspect the compiled module; the recorded output shows a RecursiveScriptModule named EfficientNet_trt.
print(trt_model)

RecursiveScriptModule(original_name=EfficientNet_trt)


In [21]:
#trt_model.keys()

In [19]:
# Repeated inspection of trt_model; prints the same text as the print(trt_model) cell above.
# NOTE(review): duplicate cell, a candidate for removal when the notebook is cleaned up.
print(trt_model)

RecursiveScriptModule(original_name=EfficientNet_trt)


In [22]:
# Time the Torch-TensorRT module; dtype="fp16" makes benchmark() cast its random input to half,
# matching the half-precision input spec used when trt_model was compiled.
benchmark(trt_model, dtype="fp16", nruns=100, input_shape=(1, 3, 224, 224))

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 2.19 ms
Iteration 20/100, avg batch time 2.19 ms
Iteration 30/100, avg batch time 2.18 ms
Iteration 40/100, avg batch time 2.18 ms
Iteration 50/100, avg batch time 2.18 ms
Iteration 60/100, avg batch time 2.18 ms
Iteration 70/100, avg batch time 2.18 ms
Iteration 80/100, avg batch time 2.18 ms
Iteration 90/100, avg batch time 2.14 ms
Iteration 100/100, avg batch time 2.05 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 486.96 images/second


In [13]:
# Another repeated inspection of trt_model; the output is identical to the earlier print(trt_model) cells.
# NOTE(review): duplicate cell, a candidate for removal when the notebook is cleaned up.
print(trt_model)

RecursiveScriptModule(original_name=EfficientNet_trt)


In [34]:
#traced_mod_trt = torch.jit.trace(trt_model, torch.randn((1,3,224,224)).to("cuda"))
# Serialize the Torch-TensorRT module (a RecursiveScriptModule, per the print output) to disk.
torch.jit.save(trt_model, "efficientnet_b0_traced_trt4.pth")

In [35]:
#traced_mod_trt = torch.jit.trace(trt_model, torch.randn((1,3,224,224)).to("cuda"))
# Reload the serialized module from the same file the torch.jit.save call in this notebook writes
# ("..._trt4.pth"), so the benchmark that follows measures the model compiled in this session
# rather than a leftover "..._trt2.pth" file from an earlier run.
efficientnet_b0_traced_trt4 = torch.jit.load("efficientnet_b0_traced_trt4.pth")

In [36]:
# Time the module reloaded from disk with the same settings as the in-memory trt_model benchmark
# (half-precision input, batch of one, 100 timed runs).
benchmark(efficientnet_b0_traced_trt4, dtype="fp16", nruns=100, input_shape=(1, 3, 224, 224))

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 2.36 ms
Iteration 20/100, avg batch time 2.37 ms
Iteration 30/100, avg batch time 2.36 ms
Iteration 40/100, avg batch time 2.35 ms
Iteration 50/100, avg batch time 2.15 ms
Iteration 60/100, avg batch time 2.01 ms
Iteration 70/100, avg batch time 1.91 ms
Iteration 80/100, avg batch time 1.83 ms
Iteration 90/100, avg batch time 1.78 ms
Iteration 100/100, avg batch time 1.73 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 578.62 images/second
