# Model serving using Triton
* Container: conda_pytorch_p310

## AutoReload

In [None]:
%load_ext autoreload
%autoreload 2

## 0.Parameters

In [None]:
import time
import boto3
import sagemaker

### 0.1. Sagemaker 

In [None]:
# ---- Static settings ------------------------------------------------------
# Flip to True to run SageMaker in local mode instead of on managed instances.
local_mode = False

nInstanceCount = 1

strPrefix = "triton-ncf"
strModelName = "ncf_food_model"
strTrainedModelDir = "./model"
strModelServingFolder = "triton-docker-serve-pt"

# ---- Session and instance type --------------------------------------------
if not local_mode:
    # Managed instance. Alternatives noted by the author:
    # ml.p3.2xlarge, ml.g4dn.8xlarge, ml.p3.16xlarge
    sagemaker_session = sagemaker.Session()
    strInstanceType = "ml.m5.2xlarge"
else:
    from sagemaker.local import LocalSession

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    strInstanceType = "local_gpu"

# ---- Values resolved from the AWS environment -----------------------------
strRegion = boto3.Session().region_name
strBucketName = sagemaker_session.default_bucket()
strExecutionRole = sagemaker.get_execution_role()

### 0.2. Triton Docker Image

In [None]:
from src.triton_util import account_id_map

* Deep learning containers
    - https://github.com/aws/deep-learning-containers/blob/master/available_images.md

In [None]:
# Registry domain: regions whose name starts with "cn-" use the .com.cn domain.
if strRegion.startswith("cn-"):
    base = "amazonaws.com.cn"
else:
    base = "amazonaws.com"

# Triton image URI, using the per-region account id from account_id_map.
# Tags noted by the author: 23.01, 23.02, 23.03, 22.07 (mme_triton_image_uri).
strTritonImageUri = (
    f"{account_id_map[strRegion]}.dkr.ecr.{strRegion}.{base}"
    "/sagemaker-tritonserver:23.03-py3"
)


In [None]:
# Show the resolved Triton image URI; the label matches the variable name.
print(f'strTritonImageUri: {strTritonImageUri}')

In [None]:
# Echo the key settings with a single call, one setting per line.
print(
    f'strRegion: {strRegion}',
    f'sagemaker_session: {sagemaker_session}',
    f'strInstanceType: {strInstanceType}',
    sep="\n",
)

## 1. 모델 패키징 (model.tar.gz) 및 S3 업로딩

### 1.1 model loading 

In [None]:
from src.inference import model_fn

* 만약 prediction에 customization이 필요하다면, "./src/model.py"의 class NCF(nn.Module)의 forward 펑션 수정할 것 

In [None]:
# Load the trained model from strTrainedModelDir using the project's model_fn.
ncf_food_model = model_fn(strTrainedModelDir)

In [None]:
# Bare expression: the notebook shows the model's repr as the cell output.
ncf_food_model

### 1.2. Conversion to torchscript 

In [None]:
import torch
import numpy as np

In [None]:
def trace_model(mode, device, model, dummy_inputs, trace_model_name):
    """Convert ``model`` to TorchScript, save it, and smoke-test the saved file.

    Args:
        mode: ``'trace'`` to convert with ``torch.jit.trace`` (driven by
            ``dummy_inputs``) or ``'script'`` to convert with
            ``torch.jit.script``.
        device: Device the model is moved to before conversion.
        model: Module to convert; it is put into eval mode first.
        dummy_inputs: Sequence of example tensors. Used as the tracing example
            and unpacked as the positional arguments of the load test.
        trace_model_name: Path the TorchScript module is written to.

    Raises:
        ValueError: If ``mode`` is neither ``'trace'`` nor ``'script'``.
    """
    # Validate first: an unknown mode used to surface later as an
    # UnboundLocalError on IR_model, after the model had already been touched.
    if mode not in ('trace', 'script'):
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'trace' or 'script'"
        )

    model = model.eval()
    model.to(device)

    if mode == 'trace':
        IR_model = torch.jit.trace(model, dummy_inputs)
    else:
        IR_model = torch.jit.script(model)

    print(f"As {mode} : Model is saved {trace_model_name}")
    torch.jit.save(IR_model, trace_model_name)

    # Reload the file just written and run it once on the example inputs.
    print("#### Load Test ####")
    loaded_m = torch.jit.load(trace_model_name)
    print(loaded_m.code)

    result = loaded_m(*dummy_inputs)
    print("Result shape: ", result.shape)

In [None]:
# Conversion strategy flags; tracing takes precedence when both are set.
is_trace, is_script = True, False

if is_trace:
    mode = 'trace'
elif is_script:
    mode = 'script'

# Use the GPU when torch reports one, otherwise the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

# Example inputs of shape (1, 100), int32: all-zero users, random items in [1, 1000).
user_np = np.zeros((1, 100), dtype=np.int32)
item_np = np.random.randint(1, 1000, (1, 100)).astype(np.int32)

dummy_inputs = [torch.from_numpy(arr).to(device) for arr in (user_np, item_np)]

In [None]:
# File name the TorchScript module is written to.
strTraceFoodModelName = 'ncf_food_model.pt'
trace_model(
    mode=mode,
    device=device,
    model=ncf_food_model,
    dummy_inputs=dummy_inputs,
    trace_model_name=strTraceFoodModelName,
)

### 1.3.Create config.pbtxt

In [None]:
%%writefile ncf_food_config.pbtxt

name: "ncf_food_model"
platform: "pytorch_libtorch"
max_batch_size: 128
input [
  {
    name: "INPUT__0"
    data_type: TYPE_INT32
    dims: [100]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_INT32
    dims: [100]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [-1]
  }
]

### 1.4 Artifact packaging
- 아래와 같은 폴더 구조를 생성해야 함.
```
model_serving_folder
    - model_name
        - version_number
            - model file
        - config file
        
# Example: 

triton-serve-pt
    - ncf_food
        - 1
            - model.pt
        - config.pbtxt

```

In [None]:
import os
from src.triton_util import copy_artifact

In [None]:
# Create the ncf_food_model folder.
# NOTE(review): copy_artifact presumably lays out
# <serving folder>/<model name>/<version>/ plus the config file — confirm in
# src/triton_util.
food_config = 'ncf_food_config.pbtxt'
copy_artifact(strModelServingFolder, strModelName, strTraceFoodModelName, food_config)

### 1.5 Upload model packages

In [None]:
import os
from src.triton_util import tar_artifact, upload_tar_s3

In [None]:
# Package the serving folder for strModelName, then upload the archive under
# strPrefix using the SageMaker session.
# NOTE(review): the exact tar file name and the returned S3 URI format come from
# src/triton_util — confirm there.
strModelTarFile = tar_artifact(strModelServingFolder, strModelName)
print("strModelTarFile: ", strModelTarFile)
strModelUriPt = upload_tar_s3(sagemaker_session, strModelTarFile, strPrefix)
print("strModelUriPt: ", strModelUriPt)

### 1.6 Remove files

In [None]:
# Local build artifacts that are no longer needed after the upload.
# NOTE(review): the tar name below is hard-coded; confirm it matches what
# tar_artifact actually produced.
listFilePath = [
    strTraceFoodModelName,
    f'{strModelName}.model.tar.gz',
    food_config,
]
for strFilePath in listFilePath:
    # Try the removal directly and report a missing file by name, so the
    # message says which artifact was not found.
    try:
        os.remove(strFilePath)
    except FileNotFoundError:
        print(f"Can not delete '{strFilePath}' as it doesn't exist")

# 3. Serving and Inference

### 3.1 Local mode
- 내부적으로 Triton 서버가 구동시에 아래 URL 스크립트가 구동 됨.
    - 여기에 맞는 필요한 환경 변수를 넣어 줌.
    - https://raw.githubusercontent.com/triton-inference-server/server/main/docker/sagemaker/serve

#### 3.1.1 Deploy

In [None]:
import json
import numpy as np
from sagemaker.model import Model

In [None]:
# UTC timestamp appended to every resource name below.
ts = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# SageMaker model, endpoint config and endpoint names, plus the S3 prefix URL.
strSMModelName = "{}-mdl-{}".format(strPrefix, ts)
strEndpointConfigName = "{}-epc-{}".format(strPrefix, ts)
strEndpointName = "{}-ep-{}".format(strPrefix, ts)
strModelDataUrl = "s3://{}/{}/".format(strBucketName, strPrefix)

In [None]:
# Triton logging switches passed to the container as environment variables.
dicContainerEnvs = {
    "SAGEMAKER_TRITON_LOG_VERBOSE": "3",
    "SAGEMAKER_TRITON_LOG_INFO": "1",
    "SAGEMAKER_TRITON_LOG_WARNING": "1",
    "SAGEMAKER_TRITON_LOG_ERROR": "1",
}

# Hand the session configured in section 0.1 to the Model. Without it the SDK
# creates its own default session at deploy time and the local-mode
# configuration set on sagemaker_session is never used.
localPytorchModel = Model(
    model_data=strModelUriPt,
    image_uri=strTritonImageUri,
    role=strExecutionRole,
    env=dicContainerEnvs,
    sagemaker_session=sagemaker_session,
)

In [None]:
# Deploy the model to an endpoint and block until the call returns.
# The instance count comes from nInstanceCount (section 0.1) instead of a
# hard-coded literal, so the parameter cell is the single place to change it.
# NOTE(review): the Model was built without predictor_cls, so per the SDK docs
# deploy() returns None here — confirm before relying on localPredictor.
localPredictor = localPytorchModel.deploy(
    instance_type=strInstanceType,
    initial_instance_count=nInstanceCount,
    endpoint_name=strEndpointName,
    wait=True,
    log=False,
)

#### 3.1.2 Inference

In [None]:
def create_sample_payload():
    """Build a sample inference request with two INT32 inputs of shape [1, 100].

    Returns:
        dict: ``{"inputs": [...]}`` holding ``INPUT__0`` (all zeros, the user
        input) and ``INPUT__1`` (random integers in [1, 1000), the item input),
        each with its data as nested Python lists.
    """
    named_arrays = (
        # user
        ("INPUT__0", np.zeros((1, 100), dtype=np.int32)),
        # item
        ("INPUT__1", np.random.randint(1, 1000, (1, 100)).astype(np.int32)),
    )

    return {
        "inputs": [
            {
                "name": tensor_name,
                "shape": [1, 100],
                "datatype": "INT32",
                "data": values.tolist(),
            }
            for tensor_name, values in named_arrays
        ]
    }

# Build one sample request and print it for inspection.
payload = create_sample_payload()
print("payload: ", payload)

In [None]:
def single_model_invoke_endpoint(client, endpoint_name, payload):
    """Send ``payload`` as JSON to an endpoint and return the decoded JSON reply.

    Args:
        client: Object exposing ``invoke_endpoint`` (a runtime client).
        endpoint_name: Name of the endpoint to invoke.
        payload: JSON-serializable request body.

    Returns:
        The response body, read, decoded as UTF-8 and parsed with ``json.loads``.
    """
    raw_body = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/octet-stream",
        Body=json.dumps(payload),
    )["Body"].read()

    return json.loads(raw_body.decode("utf8"))

# Invoke the endpoint through the SDK's local-mode runtime client.
# NOTE(review): presumably this only works when the endpoint was deployed in
# local mode (local_mode = True) — confirm.
runtime_client = sagemaker.local.LocalSagemakerRuntimeClient()
result = single_model_invoke_endpoint(runtime_client,strEndpointName, payload)
print("result : ", result)

#### 3.1.3 Delete endpoint

In [None]:
from src.inference_utils import delete_endpoint

In [None]:
# Tear down the endpoint with the local-mode SageMaker client, using the
# project's delete_endpoint helper.
client = sagemaker.local.LocalSagemakerClient()
delete_endpoint(client, strEndpointName)

### 3.2 Cloud mode

#### 3.2.1 Deploy

In [None]:
# Container definition for create_model: the Triton image and the model archive.
dicContainer = dict(
    Image=strTritonImageUri,
    ModelDataUrl=strModelUriPt,
)

In [None]:
# Show the container definition and the model name, one per line.
print(
    f'dicContainer: {dicContainer}',
    f'strSMModelName: {strSMModelName}',
    sep="\n",
)

In [None]:
# Low-level SageMaker client used for the cloud-mode deployment calls.
sm_client = boto3.client("sagemaker")

# Register the model: its name, container definition and execution role.
create_model_response = sm_client.create_model(
    PrimaryContainer=dicContainer,
    ExecutionRoleArn=strExecutionRole,
    ModelName=strSMModelName,
)

In [None]:
# Show the ARN returned by create_model.
print(f'Model Arn: {create_model_response["ModelArn"]}')

In [None]:
# Endpoint configuration with a single production variant serving the model.
# The instance count comes from nInstanceCount (section 0.1) instead of a
# hard-coded literal, so the parameter cell is the single place to change it.
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=strEndpointConfigName,
    ProductionVariants=[
        {
            "InstanceType": strInstanceType,
            "InitialVariantWeight": 1,
            "InitialInstanceCount": nInstanceCount,
            "ModelName": strSMModelName,
            "VariantName": "AllTraffic",
        }
    ],
)

In [None]:
# Show the ARN returned by create_endpoint_config.
print(f'Endpoint Config Arn: {create_endpoint_config_response["EndpointConfigArn"]}')

In [None]:
# Request creation of the endpoint from the endpoint configuration created
# earlier in this notebook; the endpoint status is polled in a later cell.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=strEndpointName,
    EndpointConfigName=strEndpointConfigName
)

In [None]:
# Show the ARN returned by create_endpoint.
print(f'Endpoint Arn: {create_endpoint_response["EndpointArn"]}')

In [None]:
%%time 

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=strEndpointName)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=strEndpointName)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Any terminal status other than "InService" means the endpoint cannot serve
# requests; stop here with the service-reported reason instead of letting the
# following inference cell fail with a less helpful error.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {strEndpointName} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

#### 3.2.2 Inference

In [None]:
# Invoke the endpoint through the boto3 sagemaker-runtime client; the bare
# call's return value is shown as the cell output.
runtime_client = boto3.Session().client('sagemaker-runtime')
single_model_invoke_endpoint(runtime_client,strEndpointName, payload)

#### 3.2.3 Delete endpoint

In [None]:
# Tear down the endpoint with a boto3 SageMaker client, using the project's
# delete_endpoint helper.
client = boto3.Session().client('sagemaker')
delete_endpoint(client, strEndpointName)