# Server VITS on Trition Docker
- 선수 노트북: 아래를 먼저 실행해야 합니다
    - 0.0-create-tts-vits-model.ipynb

## 1.  Setup environment
사용하는 패키지는 import 시점에 다시 재로딩 합니다.

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.abspath("./vits"))

for i in sys.path:
    print(i)

/home/ec2-user/SageMaker/.cs/conda/envs/vits-conda-py310/lib/python310.zip
/home/ec2-user/SageMaker/.cs/conda/envs/vits-conda-py310/lib/python3.10
/home/ec2-user/SageMaker/.cs/conda/envs/vits-conda-py310/lib/python3.10/lib-dynload

/home/ec2-user/SageMaker/.cs/conda/envs/vits-conda-py310/lib/python3.10/site-packages
/home/ec2-user/SageMaker/lab/00-trition-tts-vits/02-tts-vits-docker-trition/vits


# 2. Trition 서빙 준비

## config.pbtxt 생성

In [2]:
%%writefile workspace/config.pbtxt
name: "tts-vits"
platform: "pytorch_libtorch"
max_batch_size: 8
input [
  {
    name: "x"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "x_length"
    data_type: TYPE_INT64
    dims: [ 1 ]
    reshape: { shape: [] }
  },
  {
    name: "noise_scale"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [] }
  },
  {
    name: "length_scale"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [] }
  },
  {
    name: "noise_scale_w"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [] }
  }
]
output [
  {
    name: "OUTPUT_0"
    data_type: TYPE_FP32
    dims: [ 1, -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
dynamic_batching {
  preferred_batch_size: [ 4, 8 ]
  max_queue_delay_microseconds: 100
}
default_model_filename: "model.pt"

Overwriting workspace/config.pbtxt


# 3. 아티펙트 패키징

## 모델 리파지토리 폴더 구조
```
model_serving_folder
    - model_name
        - version_number
            - model file
        - config file

# Example
hello-serve-pt
    - hello
        - 1
            - model.pt
        - config.pbtxt

```


##  아티펙트 폴더 생성 

In [3]:
import os
from local_utils.triton_util import make_folder_structure, copy_artifact, remove_folder

# triton-hello-serve-pt 폴더 생성
workspace_folder ="workspace"
model_serving_folder = 'triton-serve-jit'
model_name = "tts-vits"
model_file_name = "trace_vits.pt"

model_path = os.path.join(workspace_folder, model_file_name)
make_folder_structure(model_serving_folder, model_name=model_name)

model_config_path = os.path.join(workspace_folder, 'config.pbtxt')

 #copy_artifact(model_serving_folder, model_name, trace_model_name, model_config)
copy_artifact(model_serving_folder=model_serving_folder, 
            model_name=model_name, 
            model_artifact=model_path, 
            config=model_config_path)

triton-serve-jit:
tts-vits

triton-serve-jit/tts-vits:
1
config.pbtxt

triton-serve-jit/tts-vits/1:
model.pt


### 폴더 삭제
- 필요시 주석 제거하고 사용하세요.

In [4]:
# model_serving_folder = 'triton-serve-jit'
# remove_folder(model_serving_folder)

# 4. 로컬 도커에서 실행 테스트

## 4.0. 도커에서의 실행 테스트는 아래와 같은 순서로 진행 함.


#### (1) 터미널 실행

아래에서 둘 중의 한개의 도커를 실행 하시면 됩니다.

#### (2) SageMaker Triton 도커 컨테이너 실행 -- triton 24.05 version
```
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

docker pull 763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tritonserver:24.05-py3

# Move to current folder (e.g.: /home/ec2-user/SageMaker/lab/00-trition-tts-vits/ )

docker run --gpus=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v `pwd`/triton-serve-jit:/models 763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tritonserver:24.05-py3 tritonserver --model-repository=/models --log-verbose=3 --log-info=1 --log-warning=1 --log-error=1
```

#### Option:  NVidia Triton 도커 컨테이너 실행
- 위의 터미널에 아래와 같이 명령어를 하나씩 실행 하세요.
```
docker run --gpus=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v `pwd`/triton-serve-jit:/models nvcr.io/nvidia/tritonserver:22.08-py3 tritonserver --model-repository=/models --log-verbose=3 --log-info=1 --log-warning=1 --log-error=1
```


#### (3) 도커 관련 유용한 명령어
```
docker rm -f $(docker ps -qa)

# 도커 실행하여 들어가 보기
docker run -it --entrypoint /bin/bash nvcr.io/nvidia/tritonserver:22.08-py3
# 실행중인 도커 컨테이너 들어가기
docker exec -it <container_name_or_id> /bin/bash


```

## 4.1. !!! #### 터미널에 "Triton 도커 컨테이너 실행" 을 해주세요. ### !!!

# 5. Run Inference on Triton Client

## Triton Client 초기화

In [5]:
from local_utils.triton_util import setup_triton_client
triton_client, grpcclient = setup_triton_client()

## Import Lib for converting text to token ID 

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd


import torch
import commons
import utils
from text import text_to_sequence


def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

hps = utils.get_hparams_from_file("vits/configs/ljs_base.json")

### Define Text to be spoken

In [7]:
# text = "Claude is AI for all of us. Whether you're brainstorming alone or building with a team of thousands, Claude is here to help"
text = "Today, we're announcing the Claude 3 model family, which sets new industry benchmarks across a wide range of cognitive tasks. The family includes three state-of-the-art models in ascending order of capability: Claude 3 Haiku, Claude 3 Sonnet, and Claude 3 Opus. Each successive model offers increasingly powerful performance, allowing users to select the optimal balance of intelligence, speed, and cost for their specific application."

## Create input variables

In [8]:
import numpy as np

def create_input_data(text, hps, noise_scale, noise_scale_w, length_scale):
    stn_tst = get_text(text, hps)
    x_tst = stn_tst
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    
    x_np = x_tst.detach().cpu().numpy() 
    x_np = x_np.reshape(1,-1)
    
    x_length_np = x_tst_lengths.detach().cpu().numpy() 
    noise_scale_data = np.array([noise_scale], dtype=np.float32)
    length_scale_data = np.array([noise_scale_w], dtype=np.float32)
    noise_scale_w_data = np.array([length_scale], dtype=np.float32)

    return (x_np, x_length_np, noise_scale_data, length_scale_data, noise_scale_w_data)


input_vars = create_input_data(text, hps, 
                              noise_scale=.667, 
                              noise_scale_w=0.8, 
                              length_scale=1)


In [9]:
x_np, x_length_np, noise_scale_data, length_scale_data, noise_scale_w_data = input_vars
# print variables
print("x_np shape:", x_np.shape)
print("x_np sample:", x_np[:5])  # Adjust the slice as needed

print("x_length_np shape:", x_length_np.shape)
print("x_length_np:", x_length_np)

print("noise_scale_data shape:", noise_scale_data.shape)
print("noise_scale_data:", noise_scale_data)

print("length_scale_data shape:", length_scale_data.shape)
print("length_scale_data:", length_scale_data)

print("noise_scale_w_data shape:", noise_scale_w_data.shape)
print("noise_scale_w_data:", noise_scale_w_data)


x_np shape: (1, 965)
x_np sample: [[  0  62   0  83   0  46   0 156   0  47   0 102   0   3   0  16   0  65
    0 102   0 123   0  16   0  70   0  56   0 156   0  43   0 135   0  56
    0  61   0 102   0 112   0  16   0  81   0  83   0  16   0  53   0  54
    0 156   0  76   0 158   0  46   0  16   0 119   0 123   0 156   0  51
    0 158   0  16   0  55   0 156   0  69   0 158   0  46   0  83   0  54
    0  16   0  48   0 156   0  72   0  55   0 102   0  54   0  51   0   3
    0  16   0  65   0 157   0 102   0  62   0 131   0  16   0  61   0 156
    0  86   0  62   0  61   0  16   0  56   0 156   0  63   0 158   0  16
    0 156   0 102   0  56   0  46   0 138   0  61   0  62   0 123   0  51
    0  16   0  44   0 156   0  86   0  56   0  62   0 131   0  55   0  69
    0 158   0 123   0  53   0  61   0  16   0  83   0  53   0 123   0 157
    0  69   0 158   0  61   0  16   0  70   0  16   0  65   0 156   0  43
    0 102   0  46   0  16   0 123   0 156   0  47   0 102   0  56   0  46
    

## Create payload for Triton client

In [10]:
import numpy as np
import tritonclient.grpc as grpcclient


def create_client_payload(x, x_length, noise_scale, length_scale, noise_scale_w):
    inputs = []

    # Determine the batch size from x
    batch_size = x.shape[0]

    # x input
    x_input = grpcclient.InferInput('x', x.shape, "INT64")
    x_input.set_data_from_numpy(x)
    inputs.append(x_input)

    # x_length input
    x_length_input = grpcclient.InferInput('x_length', [batch_size, 1], "INT64")
    x_length_input.set_data_from_numpy(x_length.reshape(batch_size, 1))
    inputs.append(x_length_input)

    # noise_scale input
    noise_scale_input = grpcclient.InferInput('noise_scale', [batch_size, 1], "FP32")
    noise_scale_input.set_data_from_numpy(np.full((batch_size, 1), noise_scale, dtype=np.float32))
    inputs.append(noise_scale_input)

    # length_scale input
    length_scale_input = grpcclient.InferInput('length_scale', [batch_size, 1], "FP32")
    length_scale_input.set_data_from_numpy(np.full((batch_size, 1), length_scale, dtype=np.float32))
    inputs.append(length_scale_input)

    # noise_scale_w input
    noise_scale_w_input = grpcclient.InferInput('noise_scale_w', [batch_size, 1], "FP32")
    noise_scale_w_input.set_data_from_numpy(np.full((batch_size, 1), noise_scale_w, dtype=np.float32))
    inputs.append(noise_scale_w_input)

    print("x data shape:", x.shape)
    print("x_length shape:", x_length.reshape(batch_size, 1).shape)
    print("noise_scale shape:", (batch_size, 1))
    print("length_scale shape:", (batch_size, 1))
    print("noise_scale_w shape:", (batch_size, 1))
    
    return inputs

inputs = create_client_payload(x=x_np, 
                      x_length=x_length_np, 
                      noise_scale=noise_scale_data, 
                      length_scale=length_scale_data,
                      noise_scale_w=noise_scale_w_data)
    


x data shape: (1, 965)
x_length shape: (1, 1)
noise_scale shape: (1, 1)
length_scale shape: (1, 1)
noise_scale_w shape: (1, 1)


##  Create output variable for Ttiton client

In [11]:
outputs = []
outputs.append(grpcclient.InferRequestedOutput('OUTPUT_0'))


# 6. inference on Triton Docker


In [12]:
from local_utils.triton_util import infer_triton_client

result = infer_triton_client(triton_client, model_name, inputs, outputs)

In [13]:
output0_data = result.as_numpy('OUTPUT_0')
output0_data.shape

(1, 1, 589824)

In [14]:
audio = output0_data[0,0]
audio

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))