### 1. 安装 huggingface-hub 与 SageMaker SDK，并下载模型到本地

In [49]:
!pip install huggingface-hub -Uqq -i https://pypi.tuna.tsinghua.edu.cn/simple/
!pip install -U sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple/

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/, https://pip.repos.neuron.amazonaws.com


In [50]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Local directory handed to snapshot_download as its cache_dir.
local_model_path = Path("./LLM_chatglm_model")
local_model_path.mkdir(exist_ok=True)
# Hugging Face Hub repository id of the model to download.
# NOTE(review): `model_name` is re-bound to the SageMaker model name in a later
# cell, so re-running the download cell after that point would use the wrong id.
model_name = "THUDM/chatglm-6b"
# Revision passed to snapshot_download; the commented-out hash is an alternative revision.
#commit_hash = "f83182484538e663a03d3f73647f10f89878f438"
commit_hash="4d458d04bb657d100a3d2206a02c9f47c640e5c5"

In [51]:
# Downloading from the Hugging Face Hub may fail repeatedly because of network
# issues, so retry -- but only a bounded number of times, and report every
# failure instead of silently swallowing it. An unbounded `while True` with
# `except: pass` would spin forever on permanent errors (e.g. a wrong repo id).
import time

MAX_DOWNLOAD_ATTEMPTS = 10

for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
    try:
        snapshot_download(repo_id=model_name, revision=commit_hash, cache_dir=local_model_path)
        break
    except Exception as err:
        print(f"Download attempt {attempt}/{MAX_DOWNLOAD_ATTEMPTS} failed: {err!r}")
        if attempt == MAX_DOWNLOAD_ATTEMPTS:
            # Give up and surface the last error to the user.
            raise
        # Exponential back-off, capped at one minute between attempts.
        time.sleep(min(2 ** attempt, 60))

Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]

### 2. 把模型拷贝到S3为后续部署做准备

In [55]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the session's public boto session (the same accessor
# the image_uris.retrieve cell uses) instead of the private `_region_name`.
region = sess.boto_session.region_name
region_name = region  # alias kept so cells that reference `region_name` still work
# Resolved once via the SageMaker session; a second STS lookup is not needed.
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")


In [56]:
s3_model_prefix = "LLM-RAG/workshop/LLM_chatglm_model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/LLM_chatglm_deploy_code"

# Locate the downloaded snapshot directory. Sorting makes the choice
# deterministic, and the snapshot whose directory name equals the pinned
# commit hash is preferred so that a stale revision left in the cache is not
# uploaded by accident.
snapshot_dirs = sorted(local_model_path.glob("**/snapshots/*"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No model snapshot found under {local_model_path}; run the download cell first."
    )
pinned_snapshots = [path for path in snapshot_dirs if path.name == commit_hash]
model_snapshot_path = pinned_snapshots[0] if pinned_snapshots else snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM-RAG/workshop/LLM_chatglm_deploy_code
model_snapshot_path: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/4d458d04bb657d100a3d2206a02c9f47c640e5c5


In [57]:

#上传模型至S3
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/4d458d04bb657d100a3d2206a02c9f47c640e5c5/.ipynb_checkpoints/configuration_chatglm-checkpoint.py to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_model/.ipynb_checkpoints/configuration_chatglm-checkpoint.py
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/4d458d04bb657d100a3d2206a02c9f47c640e5c5/.ipynb_checkpoints/README-checkpoint.md to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_model/.ipynb_checkpoints/README-checkpoint.md
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/4d458d04bb657d100a3d2206a02c9f47c640e5c5/.gitattributes to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_model/.gitattributes
upload: LLM_chatglm_model/models--THUDM--chatglm-6b/snapshots/4d458d04bb657d100a3d2206a02c9f47c640e5c5/.ipynb_checkpoints/modeling_chatglm-checkpoint.py to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_model/.ipynb_che

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [71]:
# Image URI for the global (non-China) AWS regions:
#inference_image_uri = (
#    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"
#)

# In the AWS China regions, use the image URI below instead.
# NOTE: the next cell re-assigns `inference_image_uri` via image_uris.retrieve,
# so the value set here only takes effect if that cell is skipped.
inference_image_uri = (
     # Older image tag, kept for reference:
     #f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"
    f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.9.2-cu118"
)

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.9.2-cu118


In [87]:

import sagemaker
from sagemaker import image_uris

# Resolve the DJL DeepSpeed inference image for the session's region through the
# SageMaker SDK; this overrides any `inference_image_uri` assigned earlier.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=sess.boto_session.region_name,
    version="0.23.0"
)
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118


In [88]:
# Directory that receives model.py, serving.properties and requirements.txt
# before they are packed into model.tar.gz.
!mkdir -p LLM_chatglm_deploy_code

In [89]:
%%writefile LLM_chatglm_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os

from transformers import pipeline, AutoModel, AutoTokenizer

def load_model(properties):
    """Load the tokenizer and model named by the serving properties.

    Args:
        properties: mapping of serving properties. Must contain
            "tensor_parallel_degree" and "model_dir"; when "model_id" is
            present it takes precedence over "model_dir" as the load location.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is converted to half
        precision, moved to CUDA, has gradients disabled and is in eval mode.
    """
    logging.info(f"properties: {properties}")
    # Read but not used further in this function.
    tensor_parallel = properties["tensor_parallel_degree"]

    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # Tokenizer first, then the model; both rely on the repository's custom code.
    chat_tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    chat_model = AutoModel.from_pretrained(model_location, trust_remote_code=True)
    chat_model = chat_model.half().cuda()

    # Inference only: no gradients, eval mode.
    chat_model.requires_grad_(False)
    chat_model.eval()

    return chat_model, chat_tokenizer


# Module-level cache; handle() fills `model` and `tokenizer` on the first request.
model = None
tokenizer = None
generator = None  # NOTE(review): not assigned anywhere else in the visible code


def preprocess(text):
    """Replace real newline/tab characters with the two-character sequences \\n and \\t."""
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    return text


def postprocess(text):
    """Inverse of preprocess: turn literal \\n and \\t sequences back into newline/tab."""
    return text.replace("\\n", "\n").replace("\\t", "\t")


def answer(text, history=None, sample=True, top_p=0.45, temperature=0.01, model=None):
    """Ask the chat model one question and return ``(reply, updated_history)``.

    Args:
        text: the user's question.
        history: previous conversation turns to pass to ``model.chat``. Defaults
            to a fresh empty list on every call (a mutable default list would be
            shared between calls and could leak turns across requests).
        sample: accepted for compatibility; currently not forwarded to the model.
        top_p: accepted for compatibility; currently not forwarded to the model.
        temperature: sampling temperature forwarded to ``model.chat``.
        model: object exposing ``chat(tokenizer, text, history=..., temperature=...)``.

    Returns:
        Tuple of the post-processed reply text and the history returned by the model.
    """
    if history is None:
        history = []
    text = preprocess(text)
    # `tokenizer` is the module-level tokenizer cached by handle().
    response, history = model.chat(tokenizer, text, history=history, temperature=temperature)

    return postprocess(response), history


def handle(inputs: Input):
    """Entry point for one inference request.

    Loads the model and tokenizer on the first call (cached in module globals),
    then answers the question found under the "ask" key of the JSON payload.
    Optional payload keys:
        "history": previous conversation turns (default: empty list).
        "temperature": sampling temperature forwarded to the model (default: 0.01).

    Returns:
        None for an empty request; otherwise an Output carrying a JSON object
        with "answer" and "history". If answering raises, the error is logged
        and "answer" is set to "No Answer".
    """
    # Imported locally because the module's import header does not provide
    # them; without these the except branch below would raise NameError and
    # hide the real error.
    import sys
    import traceback

    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    input_data = inputs.get_as_json()
    logging.info(f"inputs: {input_data}")

    # Bound before the try block so the fallback response can always use it.
    history = []
    try:
        history = input_data['history'] if 'history' in input_data else []
        temperature = input_data['temperature'] if 'temperature' in input_data else 0.01
        # Forward the requested temperature; it used to be parsed but ignored.
        response, history = answer(
            input_data['ask'], history=history, temperature=temperature, model=model
        )
        logging.info(f'====result {response}====')
        result = {"answer": response, "history" : history}
        return Output().add_as_json(result)

    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
        logging.error(f"=================Exception================={ex}")

    result = {"answer": "No Answer", "history" : history}
    return Output().add_as_json(result)

Overwriting LLM_chatglm_deploy_code/model.py


In [90]:

# Show the S3 location of the uploaded model weights.
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

# Use the URL printed above to replace the S3 URL in the serving.properties cell below.

option.s3url ==> s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_model/


#### Note: option.s3url 需要按照自己的账号进行修改, 可以拷贝上一个cell的输出

In [91]:
# Generate serving.properties from the notebook variables so that option.s3url
# always points at this account's bucket; no manual copy/paste of the URL is
# needed and no account-specific bucket name is hard-coded.
serving_properties = "\n".join(
    [
        "engine=Python",
        "option.tensor_parallel_degree=1",
        f"option.s3url = s3://{bucket}/{s3_model_prefix}/",
    ]
) + "\n"
Path("LLM_chatglm_deploy_code/serving.properties").write_text(serving_properties)
print(serving_properties)

Overwriting LLM_chatglm_deploy_code/serving.properties


#### 注意: 必须把transformers升级到4.27.1以上，否则会出现 [Issue344](https://github.com/THUDM/ChatGLM-6B/issues/344)

如果是中国区建议添加国内的pip镜像,如下代码所示
```
%%writefile LLM_chatglm_deploy_code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.28.1
```

In [92]:
%%writefile LLM_chatglm_deploy_code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.28.1

Overwriting LLM_chatglm_deploy_code/requirements.txt


In [93]:
!rm model.tar.gz
!cd LLM_chatglm_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz LLM_chatglm_deploy_code

LLM_chatglm_deploy_code/
LLM_chatglm_deploy_code/serving.properties
LLM_chatglm_deploy_code/model.py
LLM_chatglm_deploy_code/requirements.txt


In [94]:
# Upload the packed deployment code to s3://{bucket}/{s3_code_prefix}/ and keep
# the returned S3 URI; it is used as ModelDataUrl when the model is created.
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/LLM_chatglm_deploy_code/model.tar.gz


### 4. 创建模型 & 创建endpoint

In [95]:
!aws sagemaker delete-endpoint --endpoint-name pytorch-inference-llm-v1
!aws sagemaker delete-endpoint-config --endpoint-config-name pytorch-inference-llm-v1
!aws sagemaker delete-model --model-name pytorch-inference-llm-v1

In [96]:
from sagemaker.utils import name_from_base
import boto3

# Fixed SageMaker model name; the endpoint config and endpoint reuse it below.
# NOTE(review): this re-binds `model_name`, which the download cell uses as the
# Hugging Face repo id -- re-running that cell after this one would use the wrong value.
model_name = 'pytorch-inference-llm-v1'# name_from_base(f"chatglm") Note: Need to specify model_name
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: the container image plus the S3 archive with the deployment code.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

pytorch-inference-llm-v1
Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118
Created Model: arn:aws-cn:sagemaker:cn-north-1:507392672631:model/pytorch-inference-llm-v1


In [97]:
# The endpoint config and the endpoint share the model's name; the commented
# f-strings show an alternative naming scheme with distinct suffixes.
endpoint_config_name =model_name # f"{model_name}-config"
endpoint_name =model_name # f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge can also be chosen as the instance type.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.2xlarge",
            "InitialInstanceCount": 1,
            # Optional settings, currently disabled:
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
           # "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)
# Last expression of the cell: display the raw API response.
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:507392672631:endpoint-config/pytorch-inference-llm-v1',
 'ResponseMetadata': {'RequestId': '5ac710ef-228f-49c2-95ef-1bdc23d43418',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5ac710ef-228f-49c2-95ef-1bdc23d43418',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '109',
   'date': 'Wed, 09 Aug 2023 05:18:16 GMT'},
  'RetryAttempts': 0}}

In [98]:
# Request creation of the endpoint from the config defined above; the polling
# cell that follows waits while the endpoint status is "Creating".
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response['EndpointArn']
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:507392672631:endpoint/pytorch-inference-llm-v1


#### 持续检测模型部署进度

In [None]:
import time

# Poll the endpoint once a minute until it leaves the "Creating" state.
POLL_INTERVAL_SECONDS = 60

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(POLL_INTERVAL_SECONDS)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# A failed deployment otherwise only shows "Failed"; surface the reason that
# SageMaker reports so the problem can be diagnosed from the notebook.
if status == "Failed":
    print("FailureReason: " + resp.get("FailureReason", "<not reported>"))

Status: Creating
Status: Creating
Status: Creating
Status: Creating


### 5. 模型测试

In [None]:
import json
import boto3

# Runtime client and target endpoint used by the test cells below.
client = boto3.client('runtime.sagemaker')
sagemaker_endpoint_name='pytorch-inference-llm-v1'


def query_endpoint_with_json_payload(encoded_json):
    """Invoke the endpoint with an already-encoded JSON body and return the raw response."""
    return client.invoke_endpoint(
        EndpointName=sagemaker_endpoint_name,
        ContentType='application/json',
        Body=encoded_json,
    )

def parse_response_texts(query_response):
    """Return the "answer" field from an endpoint response.

    Args:
        query_response: mapping whose 'Body' entry is a readable stream
            containing a JSON document with an "answer" key.
    """
    raw_body = query_response['Body'].read()
    return json.loads(raw_body)["answer"]


In [None]:
payload = {"ask": """已知信息:13英寸MacBook Pro。title: AI抢饭碗引发好莱坞大罢工！业内大佬警告：将产生多米诺骨牌效应
ctime: 1689554822
brief: ①前派拉蒙影业CEO巴里·迪勒表示，若不尽快解决好莱坞编剧和演员工会的罢工问题，将会带来毁灭性的影响；
②当地时间14日，美国演员工会宣布支持美国编剧协会，正式加入罢工行列，这也是60多年来好莱坞最大的两个工会组织首次同时罢工。
content: <p><strong>财联社7月17日讯（编辑 卞纯）</strong>人工智能（AI）对于美国好莱坞的冲击波仍在发酵。继美国编剧协会开始罢工以后，美国演员工会也加入了。</p>
<p>当地时间14日，美国演员工会宣布支持美国编剧协会，正式加入罢工行列，这也是60多年来好莱坞最大的两个工会组织首次同时罢工。此前，工会方面与制片公司未能就劳资问题达成协议。。</第2条信息>
<第3条信息>在人工智慧浪潮带动下，七家重量级科技股股价飙升40%至200%以上，成为标普500指数今年迄今17%的涨幅的主因，近期这几家公司更是被外界称为“绝地七骑士”，包括苹果、微软、谷歌、亚马逊、脸书、英伟达和特斯拉。</p>
<p>  目前华尔街对七巨头的财报获利预期都相当高，美银全球研究预计，未来12个月，这些公司的收益将平均增长19%，是标普500指数其他成份股8%预期增幅的2倍多。</p>
<p>在此背景下，特斯拉将于美东时间周三盘后（北京时间7月20日清晨）率先公布2023年二季度财报。值得关注的是，在最近一个月市场普遍预期Model 3将新增线控转向。</p>
<p><strong>一、自动驾驶必备黑科技，各路资本纷纷入局</strong></p>
<p>过去几年，智能汽车领域最热门的投资赛道，大多集中在智能驾驶的感知以及决策环节，比如激光雷达、毫米波雷达、AI芯片等。今年各路资本纷纷涌向线控底盘领域。。</第3条信息>
<第5条信息>今日两市炸板个股较多，炸板率维持高位，目前超40%。短线情绪指标显示，市场短线情绪午后持续回落，接近低迷区。title: 金融学子就业高度内卷：卷学历、卷证书、卷实习，仍不敌复合人才
ctime: 1689574264
brief: ①2022年金融财经专业的应届生超过100万，占高校毕业生的10%；
②人才饱和、竞争激烈将工作门槛无限拉高，金融学子们从各方面高度内卷；
③现在，复合背景的金融人才往往更受券商青睐。
content: <p><strong>财联社7月17日讯（记者 肖斐歆）</strong>金融专业曾是公认就业前景广、薪资水平高的行业，其中券商更是金融精英聚集地，是不少年轻人追逐的梦想工作去处。</p>
<p>据教育部统计，2022年高校毕业生数量达1076万人，金融财经专业的应届生超过100万。不仅如此，学霸也偏爱金融专业，高考状元扎堆去顶级学府学金融似乎已是常态。</p>
<p>门槛的无限拉高让金融学子们高度内卷，卷学历、卷证书、卷实习……伴随着券商降薪大潮的袭来和复合背景人才更受青睐的趋势，金融学子该何去何从？。</第5条信息>
<第6条信息></p>title: 金梓才获擢升财通基金副总经理，绩而优则仕与80后标签打足，年内已近10位基金经理获提升
ctime: 1689554818
brief: ①金梓才的升任可以窥见财通基金的权益布局思路；
②“绩优则仕”留人才，年内已有近10位基金经理升至副总。
content: <p><strong>财联社7月17日讯（记者 闫军）</stro
基于以上已知信息，简洁和专业的来回答用户的问题，并告知是依据哪些信息来进行回答的。
如果无法从已知信息中得到答案，请仅回答 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"，不允许在答案中添加编造成分，答案请使用中文。
问题是:
目前华尔街对七巨头期望怎么样？"""}
# Send the payload as UTF-8 encoded JSON and print the model's answer as plain text.
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
# Disabled debugging helpers (display_answer is only defined in a later cell):
#print(query_response)
#display_answer(query_response)
generated_texts = parse_response_texts(query_response)
print(generated_texts)

In [None]:
# 使用 Markdown 格式打印模型输出
from IPython.display import display, Markdown, clear_output

def display_answer(response_all):
    """Render each response as Markdown, replacing the previously shown output.

    Args:
        response_all: iterable of ``(response, history)`` pairs.

    Returns:
        The history from the last pair, or None when ``response_all`` yields
        nothing (previously this case raised UnboundLocalError).
    """
    history = None  # stays None if the iterable is empty
    for response, history in response_all:
        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        display(Markdown(response))
    return history

In [None]:
%%time
#chatglm1-6b g4dn.2xlarge
payload = {"ask": """
人工智能（AI）对于美国好莱坞的冲击波仍在发酵。继美国编剧协会开始罢工以后，美国演员工会也加入了。</p>
当地时间14日，美国演员工会宣布支持美国编剧协会，正式加入罢工行列，这也是60多年来好莱坞最大的两个工会组织首次同时罢工。此前，工会方面与制片公司未能就劳资问题达成协议。。</第2条信息>
在人工智慧浪潮带动下，七家重量级科技股股价飙升40%至200%以上，成为标普500指数今年迄今17%的涨幅的主因，近期这几家公司更是被外界称为“绝地七骑士”，包括苹果、微软、谷歌、亚马逊、脸书、英伟达和特斯拉。</p>
目前华尔街对七巨头的财报获利预期都相当高，美银全球研究预计，未来12个月，这些公司的收益将平均增长19%，是标普500指数其他成份股8%预期增幅的2倍多。</p>

基于以上已知信息，简洁和专业的来回答用户的问题，并告知是依据哪些信息来进行回答的。
如果无法从已知信息中得到答案，请仅回答 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"，不允许在答案中添加编造成分，答案请使用中文。
问题是:
目前华尔街对七巨头期望怎么样？"""}
# Send the payload as UTF-8 encoded JSON and render the answer as Markdown.
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
# Disabled debugging helpers:
#print(query_response)
#display_answer(query_response)
generated_texts = parse_response_texts(query_response)

# display/Markdown come from the IPython.display import in the earlier cell.
display(Markdown(generated_texts))

In [None]:
%%time
# chatglm-v1: minimal smoke test with a single short question and no history.
payload = {"ask": "晚上睡不着怎么办？"}
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
generated_texts = parse_response_texts(query_response)
print(generated_texts)

#### 清除模型Endpoint和config

In [None]:
import boto3
sagemaker_endpoint_name='pytorch-inference-llm-v1'
def cleanup():
    """Delete the endpoint, its endpoint config and the model (in that order).

    The endpoint is removed first because it is the resource that keeps
    instances running, and an endpoint config should not be deleted while an
    endpoint still uses it. Each step is attempted independently: a failure
    (for example, a resource that no longer exists) is printed and the
    remaining deletions still run.
    """
    # Named sm_client locally to avoid shadowing the imported `sagemaker` module.
    sm_client = boto3.client('sagemaker')

    deletion_steps = [
        ("endpoint", lambda: sm_client.delete_endpoint(EndpointName=sagemaker_endpoint_name)),
        (
            "endpoint config",
            lambda: sm_client.delete_endpoint_config(EndpointConfigName=sagemaker_endpoint_name),
        ),
        ("model", lambda: sm_client.delete_model(ModelName=sagemaker_endpoint_name)),
    ]

    for label, delete in deletion_steps:
        try:
            delete()
            print(f"Deleted {label}: {sagemaker_endpoint_name}")
        except sm_client.exceptions.ClientError as err:
            print(f"Could not delete {label} {sagemaker_endpoint_name}: {err}")

In [None]:
#cleanup()
#!aws sagemaker delete-endpoint --endpoint-name pytorch-inference-llm-v1
#!aws sagemaker delete-endpoint-config --endpoint-config-name pytorch-inference-llm-v1
#!aws sagemaker delete-model --model-name pytorch-inference-llm-v1