# Triton Inference Server Test

This notebook compares image-classification predictions from a model executed locally in PyTorch with predictions for the same inputs returned by an NVIDIA Triton Inference Server endpoint serving that model.

In [1]:
import argparse
import numpy as np
import sys
from functools import partial
import os
from tritonclient import grpc
import tritonclient.grpc.model_config_pb2 as mc
from tritonclient import http
from tritonclient.utils import triton_to_np_dtype
from tritonclient.utils import InferenceServerException
import torch
from clearml import InputModel, Task
import shutil
import pathlib

# Local modules
from cub_tools.trainer import ClearML_Ignite_Trainer
from cub_tools.args import get_parser
from cub_tools.config import get_cfg_defaults, get_key_value_dict
from cub_tools.triton import run_inference, get_model_info

**Models (name and ClearML model ID):**

    1. PNASNet - d1a0a6c9a33f4d1da1bdf4f81f1595fa
    
    2. EfficientNetB0 - 0c8c57cf3c804c14a303375a83509477
    
    3. SwinT Base - 2be1bb58183c42e2aeb5f9af8c9510fc
    
    4. GoogLeNet - e72b8460dea742068cbb461107c6f725

In [2]:
# Notebook run parameters
# Supported model_name values: PNASNet, EfficientNetB0, SwinTBase, GoogLeNet
model_name = "SwinTBase"
# Number of test image batches
n_tests = 15

Notebook code starts from here.

In [3]:
# Resolve `model_name` into the three things the rest of the notebook needs:
#   model         - ClearML InputModel handle for the trained weights
#   model_config  - path to the training config YAML for that architecture
#   endpoint_name - name of the model endpoint on the Triton server
#
# A lookup table replaces the if/elif chain so that an unsupported name fails
# immediately with a clear message, instead of leaving the variables undefined
# (or stale from a previous run of this cell).

# Directory holding the per-library config folders; edit once when running on another machine.
config_root = '/home/edmorris/projects/image_classification/caltech_birds/scripts/configs'

model_registry = {
    'PNASNet': {
        'model_id': 'd1a0a6c9a33f4d1da1bdf4f81f1595fa',
        'config': 'pytorchcv/pnasnetlarge_config.yaml',
        'endpoint_name': 'cub200_pnasnet',
    },
    'EfficientNetB0': {
        'model_id': '0c8c57cf3c804c14a303375a83509477',
        'config': 'pytorchcv/efficientnet_b0_config.yaml',
        'endpoint_name': 'cub200_enetb0',
    },
    'SwinTBase': {
        'model_id': '2be1bb58183c42e2aeb5f9af8c9510fc',
        'config': 'timm/swinbase_config.yaml',
        'endpoint_name': 'cub200_swinbase',
    },
    'GoogLeNet': {
        'model_id': 'e72b8460dea742068cbb461107c6f725',
        'config': 'torchvision/googlenet_config.yaml',
        'endpoint_name': 'cub200_googlenet',
    },
}

if model_name not in model_registry:
    raise ValueError(
        'Unsupported model_name {!r}; expected one of: {}'.format(
            model_name, ', '.join(model_registry)
        )
    )

selected_model = model_registry[model_name]
# Only the selected model is instantiated, as in the original branch logic.
model = InputModel(model_id=selected_model['model_id'])
model_config = os.path.join(config_root, selected_model['config'])
endpoint_name = selected_model['endpoint_name']


In [4]:
# Get the model weights locally.
# `model` is the ClearML InputModel selected in the previous cell; the path
# returned here is passed to trainer.update_model_from_checkpoint() further down.
local_cache_path = model.get_local_copy()

In [5]:
# Config overrides, given as a flat [key, value, key, value, ...] list.
cmd_args = [
    # Don't do anything to the directory structure.
    'DIRS.CLEAN_UP', False,
    # Don't load default weights, as we want to load our own.
    'MODEL.PRETRAINED', False,
]

# TODO Get config from clearml task at some point. From model.task to get task ID, and then pull the task config.
trainer = ClearML_Ignite_Trainer(
    task=None,
    config=model_config,
    cmd_args=cmd_args,
)


[INFO] Parameters Override:: ['DIRS.CLEAN_UP', False, 'MODEL.PRETRAINED', False]
DATA:
  DATA_DIR: /home/edmorris/projects/image_classification/caltech_birds/data/images
  NUM_CLASSES: 200
  TEST_DIR: test
  TRAIN_DIR: train
  TRANSFORMS:
    PARAMS:
      AGGRESIVE:
        persp_distortion_scale: 0.25
        rotation_range: (-10.0, 10.0)
        type: all
      DEFAULT:
        img_crop_size: 384
        img_resize: 512
    TYPE: default
DIRS:
  CLEAN_UP: False
  ROOT_DIR: /home/edmorris/projects/image_classification/caltech_birds
  WORKING_DIR: /home/edmorris/projects/image_classification/caltech_birds/models/classification/ignite_swin_base_patch4_window12_384
EARLY_STOPPING_PATIENCE: 5
MODEL:
  MODEL_LIBRARY: timm
  MODEL_NAME: swin_base_patch4_window12_384
  PRETRAINED: False
  WITH_AMP: False
  WITH_GRAD_SCALE: False
SYSTEM:
  LOG_HISTORY: True
TRAIN:
  BATCH_SIZE: 16
  LOSS:
    CRITERION: CrossEntropy
  NUM_EPOCHS: 40
  NUM_WORKERS: 4
  OPTIMIZER:
    PARAMS:
      lr: 0.001
 

In [6]:
# Show each transform parameter group from the loaded config, one "name :: settings" entry per group.
print('Image Transformer Properties')
transform_params = trainer.config.DATA.TRANSFORMS.PARAMS
for group_name, group_settings in transform_params.items():
    print(f'{group_name} :: {group_settings}')

Image Transformer Properties
DEFAULT :: img_crop_size: 384
img_resize: 512
AGGRESIVE :: persp_distortion_scale: 0.25
rotation_range: (-10.0, 10.0)
type: all


In [7]:
# Get a sample dataset for running inference with
# The transforms are created first, then the data loaders; shuffling is
# requested for both the 'train' and 'test' splits.
trainer.create_datatransforms()
trainer.create_dataloaders(shuffle={'train' : True, 'test' : True})

***********************************************
**            DATASET SUMMARY                **
***********************************************
train  size::  5994  images
test  size::  5794  images
Number of classes::  200
***********************************************
[INFO] Created data loaders.


In [8]:
# Build the network without moving it to the device (load_to_device=False);
# the trained weights are loaded from the checkpoint in a later cell.
trainer.create_model(load_to_device=False)

[INFO] Successfully created model but NOT pushed it to the device cuda:0
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [16, 128, 96, 96]           6,272
         LayerNorm-2            [16, 9216, 128]             256
        PatchEmbed-3            [16, 9216, 128]               0
           Dropout-4            [16, 9216, 128]               0
         LayerNorm-5            [16, 9216, 128]             256
            Linear-6             [16, 144, 384]          49,536
           Softmax-7          [16, 4, 144, 144]               0
           Dropout-8          [16, 4, 144, 144]               0
            Linear-9             [16, 144, 128]          16,512
          Dropout-10             [16, 144, 128]               0
  WindowAttention-11             [16, 144, 128]               0
         Identity-12            [16, 9216, 128]               0
        LayerNorm-13          

In [9]:
# Load the weights file fetched earlier (local_cache_path) into the trainer's model.
# NOTE(review): the logged output indicates this call also pushes the model to the device - confirm in cub_tools.
trainer.update_model_from_checkpoint(checkpoint_file=local_cache_path)

[INFO] Successfully loaded weights into the model from weights file:: /home/edmorris/.clearml/cache/storage_manager/global/20e3119d38b61ad083794837894030cf.cub200_swin_base_patch4_window12_384_ignite_best_model_0.pt
[INFO] Successfully updated model and pushed it to the device cuda:0
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [16, 128, 96, 96]           6,272
         LayerNorm-2            [16, 9216, 128]             256
        PatchEmbed-3            [16, 9216, 128]               0
           Dropout-4            [16, 9216, 128]               0
         LayerNorm-5            [16, 9216, 128]             256
            Linear-6             [16, 144, 384]          49,536
           Softmax-7          [16, 4, 144, 144]               0
           Dropout-8          [16, 4, 144, 144]               0
            Linear-9             [16, 144, 128]          16,512
          

          Linear-218            [16, 144, 1536]         787,968
         Softmax-219         [16, 16, 144, 144]               0
         Dropout-220         [16, 16, 144, 144]               0
          Linear-221             [16, 144, 512]         262,656
         Dropout-222             [16, 144, 512]               0
 WindowAttention-223             [16, 144, 512]               0
        DropPath-224             [16, 576, 512]               0
       LayerNorm-225             [16, 576, 512]           1,024
          Linear-226            [16, 576, 2048]       1,050,624
            GELU-227            [16, 576, 2048]               0
         Dropout-228            [16, 576, 2048]               0
          Linear-229             [16, 576, 512]       1,049,088
         Dropout-230             [16, 576, 512]               0
             Mlp-231             [16, 576, 512]               0
        DropPath-232             [16, 576, 512]               0
SwinTransformerBlock-233             [16

In [10]:
# Compare class predictions from the local PyTorch model with those returned by
# the Triton endpoint, on `n_tests` batches drawn from the validation loader.
print('-'*90)
print('Running a comparison of PyTorch and nVidia Triton Inference Server inference results.')
print('Model Name:: {} \nModel Endpoint:: {}'.format(model_name, endpoint_name))
print('-'*90)

# Set the model into eval mode once; it does not need repeating for every batch.
trainer.model.eval()

# zip(range(n_tests), loader) yields exactly n_tests batches (the previous
# np.arange(1, n_tests, 1) produced only n_tests - 1) and walks a single loader
# iterator instead of rebuilding one for every batch. It stops early if the
# loader holds fewer than n_tests batches.
for batch_idx, (X, y) in zip(range(n_tests), trainer.val_loader):

    # Push input images to the model's device
    X_gpu = X.to(trainer.device)
    # Run inference on the validation batch; gradients are not needed here,
    # so skip building the autograd graph.
    with torch.no_grad():
        y_prob_pred = trainer.model(X_gpu)
    # Get predicted classes (index of the largest output per image)
    _, y_pred = torch.max(y_prob_pred, 1)

    # Get Triton served predicted classes
    y_pred_proba_remote, y_pred_remote = run_inference(X.numpy(), X.shape, model_name=endpoint_name, VERBOSE=False)

    print('Result:: \ty\t\t:: {} \n\t \ty_pred[local]\t:: {} \n\t \ty_pred[triton]\t:: {} '.format(y.numpy(),y_pred.cpu().numpy(),y_pred_remote))
    print('')

------------------------------------------------------------------------------------------
Running a comparison of PyTorch and nVidia Triton Inference Server inference results.
Model Name:: SwinTBase 
Model Endpoint:: cub200_swinbase
------------------------------------------------------------------------------------------
Result:: 	y		:: [144  57] 
	 	y_pred[local]	:: [146  57] 
	 	y_pred[triton]	:: [146  57] 

Result:: 	y		:: [11 89] 
	 	y_pred[local]	:: [11 89] 
	 	y_pred[triton]	:: [11 89] 

Result:: 	y		:: [166  45] 
	 	y_pred[local]	:: [166  45] 
	 	y_pred[triton]	:: [166  45] 

Result:: 	y		:: [ 90 107] 
	 	y_pred[local]	:: [ 90 107] 
	 	y_pred[triton]	:: [ 90 107] 

Result:: 	y		:: [116  89] 
	 	y_pred[local]	:: [131  89] 
	 	y_pred[triton]	:: [131  89] 

Result:: 	y		:: [144  53] 
	 	y_pred[local]	:: [144  53] 
	 	y_pred[triton]	:: [144  53] 

Result:: 	y		:: [114 130] 
	 	y_pred[local]	:: [114 120] 
	 	y_pred[triton]	:: [114 120] 

Result:: 	y		:: [30 38] 
	 	y_pred[local]	::