# [모듈 1.0] 워밍업 : Triton Docker 처음 시작하기

# 1. 환경 셋업

## 1.1. 기본 세팅
autoreload 확장을 사용하여, 수정된 패키지를 코드 실행 시마다 자동으로 다시 로딩합니다.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('./src')

# 2. Hello Model 준비

In [2]:
import torch
import numpy as np

class MyCell(torch.nn.Module):
    """Toy cell that computes ``tanh(linear(x) + h)``."""

    def __init__(self):
        super().__init__()
        # Single 4 -> 4 affine layer applied to the input.
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x, h):
        """Return ``tanh(self.linear(x) + h)`` as the new state."""
        projected = self.linear(x)
        return torch.tanh(projected + h)



In [3]:
# Instantiate the toy cell and draw two random 3x4 tensors (x first, then h).
my_cell = MyCell()
x = torch.rand(3, 4)
h = torch.rand(3, 4)

print(f"x: \n{x}, \n h: \n{h}")
# Last expression of the cell: the notebook displays the forward result.
my_cell(x, h)

x: 
tensor([[0.9908, 0.9052, 0.8996, 0.0180],
        [0.1980, 0.2825, 0.8672, 0.2721],
        [0.7820, 0.6974, 0.6783, 0.0481]]), 
 h: 
tensor([[0.7485, 0.3368, 0.1435, 0.2451],
        [0.9100, 0.6574, 0.9836, 0.2775],
        [0.4921, 0.1276, 0.6965, 0.3775]])


tensor([[0.8283, 0.5090, 0.5656, 0.1928],
        [0.8522, 0.4230, 0.7423, 0.0349],
        [0.7493, 0.3181, 0.7490, 0.2462]], grad_fn=<TanhBackward>)

# 3. Triton 서빙 준비

## 3.1. TorchScript로 변환

In [4]:
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

def trace_model(mode, device, model, dummy_inputs, trace_model_name):
    """Convert ``model`` to TorchScript, save it, and smoke-test the saved file.

    The model is put in eval mode, moved to ``device``, converted, and written
    to ``trace_model_name``. The file is then loaded back, its generated code
    is printed, and it is run once on ``dummy_inputs``.

    Args:
        mode: ``'trace'`` to convert with ``torch.jit.trace`` (using
            ``dummy_inputs`` as example inputs) or ``'script'`` to convert
            with ``torch.jit.script``.
        device: Device the model is moved to before conversion.
        model: The ``torch.nn.Module`` to convert.
        dummy_inputs: Sequence of example tensors, passed positionally to the
            model. They are expected to already be on a matching device.
        trace_model_name: Path of the TorchScript file to write.

    Raises:
        ValueError: If ``mode`` is neither ``'trace'`` nor ``'script'``.
    """
    # Fail early with a clear message instead of hitting an unbound local
    # variable at save time.
    if mode not in ('trace', 'script'):
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'trace' or 'script'"
        )

    model = model.eval()
    model.to(device)

    if mode == 'trace':
        ir_model = torch.jit.trace(model, tuple(dummy_inputs))
    else:
        ir_model = torch.jit.script(model)

    # Report success only after the file has actually been written.
    torch.jit.save(ir_model, trace_model_name)
    print(f"As {mode} : Model is saved {trace_model_name}")

    print("#### Load Test ####")
    loaded_m = torch.jit.load(trace_model_name)
    print(loaded_m.code)

    # Unpack so the check works for any number of model inputs.
    result = loaded_m(*dummy_inputs)
    print("Result : \n", result)


Using cuda device


In [5]:
# Choose the TorchScript conversion method; tracing wins if both flags are set.
is_trace = False
is_script = True

if is_trace:
    mode = 'trace'
elif is_script:
    mode = 'script'
else:
    # Without this branch `mode` would stay undefined and the call below
    # would fail with a NameError.
    raise ValueError("Set either is_trace or is_script to True")

# Example inputs moved to the same device the model will be moved to.
dummy_inputs = [
    x.to(device), h.to(device)
    ]
print("dummy_inputs: \n", dummy_inputs)

trace_model_name = 'hello_model.pt'
trace_model(mode, device, my_cell, dummy_inputs, trace_model_name)


dummy_inputs: 
 [tensor([[0.9908, 0.9052, 0.8996, 0.0180],
        [0.1980, 0.2825, 0.8672, 0.2721],
        [0.7820, 0.6974, 0.6783, 0.0481]], device='cuda:0'), tensor([[0.7485, 0.3368, 0.1435, 0.2451],
        [0.9100, 0.6574, 0.9836, 0.2775],
        [0.4921, 0.1276, 0.6965, 0.3775]], device='cuda:0')]
As script : Model is saved hello_model.pt
#### Load Test ####
def forward(self,
    x: Tensor,
    h: Tensor) -> Tensor:
  _0 = torch.add((self.linear).forward(x, ), h, alpha=1)
  return torch.tanh(_0)

Result : 
 tensor([[0.8283, 0.5090, 0.5656, 0.1928],
        [0.8522, 0.4230, 0.7423, 0.0349],
        [0.7493, 0.3181, 0.7490, 0.2462]], device='cuda:0',
       grad_fn=<TanhBackward>)


## 3.2. config.pbtxt 생성

In [6]:
%%writefile hello_config.pbtxt
# Triton model configuration for the "hello" TorchScript model.
# max_batch_size > 0 enables batching, so every `dims` entry below describes
# a single item and must NOT include the batch dimension.
name: "hello"
platform: "pytorch_libtorch"
max_batch_size: 128
input [
  {
    name: "INPUT__0"
    data_type: TYPE_FP32
    dims: [3,4]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_FP32
    dims: [3,4]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [3,4]
  }
]

Overwriting hello_config.pbtxt


# 4. 아티팩트 패키징

## 모델 리포지토리 폴더 구조
```
model_serving_folder
    - model_name
        - version_number
            - model file
        - config file

# Example
hello-serve-pt
    - hello
        - 1
            - model.pt
        - config.pbtxt

```


## 4.1. hello 폴더 생성 및 아티팩트 복사

In [7]:
import os
from triton_util import make_folder_structure, copy_artifact, remove_folder

# Names used to build the triton-hello-serve-pt model repository.
model_serving_folder = 'triton-hello-serve-pt'
model_name = 'hello'
model_config = 'hello_config.pbtxt'

# Create the folder layout first, then copy the saved model and its config in.
make_folder_structure(model_serving_folder, model_name)
copy_artifact(model_serving_folder, model_name, trace_model_name, model_config)

triton-hello-serve-pt:
hello

triton-hello-serve-pt/hello:
1
config.pbtxt

triton-hello-serve-pt/hello/1:
model.pt


### 폴더 삭제
- 필요시 주석 제거하고 사용하세요.

In [14]:
# model_serving_folder = 'triton-hello-serve-pt'
# remove_folder(model_serving_folder)

triton-hello-serve-pt is removed


# 5. 로컬 도커에서 실행 테스트

## 5.0. 도커에서의 실행 테스트는 아래와 같은 순서로 진행합니다.

#### (0) Triton Client 초기화
```
from triton_util import setup_triton_client
triton_client, grpcclient = setup_triton_client()
```

#### (1) 터미널 실행
![terminal.png](img/terminal.png)

#### (2) Triton 도커 컨테이너 실행
- 위의 터미널에서 아래 명령어를 하나씩 실행하세요.
```
cd /home/ec2-user/SageMaker/Neural-Collaborative-Filtering-On-SageMaker/2_Triton_Inference

docker run --gpus=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v `pwd`/triton-hello-serve-pt:/models nvcr.io/nvidia/tritonserver:22.08-py3 tritonserver --model-repository=/models --log-verbose=3 --log-info=1 --log-warning=1 --log-error=1
```
#### (3) Triton 클라이언트로 추론 실행
#### (4) 도커 중단 및 삭제
```
docker rm -f $(docker ps -qa)
```

## 5.1. Triton Client 초기화

In [9]:
from triton_util import setup_triton_client
# Project helper from triton_util. The returned `grpcclient` is used later in
# this notebook as the namespace providing InferInput and InferRequestedOutput.
triton_client, grpcclient = setup_triton_client()

## 5.2. !!! 터미널에서 "(2) Triton 도커 컨테이너 실행" 단계를 먼저 수행해 주세요 !!!

## 5.3. 입력 payload 생성

In [10]:
def create_client_payload():
    """Build the two input tensors for one inference request.

    Draws two independent random float32 arrays of shape (1, 3, 4), wraps
    them as the ``INPUT__0`` and ``INPUT__1`` gRPC inputs, and prints both
    arrays so the request can be compared with the response.

    Returns:
        list: Two ``grpcclient.InferInput`` objects, ``INPUT__0`` first.
    """
    shape = (1, 3, 4)

    # Draw order (input0, then input1) is kept so seeded runs stay reproducible.
    input0_data = np.random.randn(*shape).astype(np.float32)
    input1_data = np.random.randn(*shape).astype(np.float32)

    inputs = []
    # Each input gets its OWN array; previously INPUT__1 was loaded with
    # input0_data, so both inputs carried identical values.
    for name, data in (('INPUT__0', input0_data), ('INPUT__1', input1_data)):
        infer_input = grpcclient.InferInput(name, list(shape), "FP32")
        infer_input.set_data_from_numpy(data)
        inputs.append(infer_input)

    print("input0_data: \n",input0_data) 
    print("input1_data: \n",input1_data) 

    return inputs



In [11]:
# Build the request inputs; the call also prints the generated arrays.
inputs = create_client_payload()


input0_data: 
 [[[-1.7612643  -0.7525186  -0.98555493  0.8345339 ]
  [ 1.231041    0.4869961   0.5787533   1.8141615 ]
  [ 0.24886777 -1.0502387   1.3411019   0.88277024]]]
input1_data: 
 [[[-1.0084157  -0.24527276  0.6400702   1.1905762 ]
  [-0.44431686  0.41970655  0.5309932   0.11136666]
  [-0.1452226   2.5263047   1.3037627   0.58281195]]]


## 5.4. 출력 변수 생성

In [12]:
# Request the model's output tensor by name.
outputs = [grpcclient.InferRequestedOutput('OUTPUT__0')]


## 5.5. Triton에 추론 요청


In [13]:
from triton_util import infer_triton_client

# Project helper from triton_util: called with the client, the model name
# ('hello'), and the inputs/outputs prepared in the cells above.
infer_triton_client(triton_client, model_name, inputs, outputs)

#### output #####
(1, 3, 4)
