In [None]:
# Upgrade the AWS SDKs used by this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the upgraded packages are the ones the following cells import.
%pip install --upgrade boto3 -i https://pypi.tuna.tsinghua.edu.cn/simple
%pip install --upgrade sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
# For notebook instances (Amazon Linux): install git and git-lfs.
# Every yum / amazon-linux-extras call passes -y because a notebook cell has no
# interactive stdin to answer the confirmation prompt.
!sudo yum update -y
!sudo yum install -y amazon-linux-extras
!sudo amazon-linux-extras install epel -y
!sudo yum update -y
!sudo yum install git-lfs git -y

将 `s3_dir` 修改为你希望存储模型文件的 S3 目录。

In [None]:
# Model repository on wisemodel.cn, in "<organisation>/<model>" form.
repository = "Baichuan-inc/Baichuan2-13B-Chat-4bits"
url_suffix = repository + ".git"
model_id = repository.split("/")[-1]

# Specify your S3 dir to store the model repo, example-> s3://your-bucket-name/your-s3-dir
s3_dir = "s3://your-bucket-name/your-s3-dir"

# Strip any trailing "/" from s3_dir so the joined location never contains "//",
# which S3 would treat as an extra, empty-named path segment.
s3_location = s3_dir.rstrip("/") + "/" + model_id + "/"
repo = "https://www.wisemodel.cn/" + url_suffix
local_model_dir = "./" + model_id + "/"

# Show the derived values so they can be checked before downloading/uploading.
print(s3_location)
print(repo)
print(local_model_dir)

从 wisemodel.cn 下载模型。请确保 notebook instance 有足够的磁盘空间：Baichuan2-13B-Chat-4bits 大约需要 16 GB。

In [None]:
# Enable Git LFS so the clone fetches the real weight files instead of pointers.
!git lfs install
# Clone only when the checkout does not exist yet, so re-running this cell does
# not fail with "destination path already exists".
!test -d $local_model_dir || git clone $repo
# To use the 7B variant instead, point `repository` at
# Baichuan-inc/Baichuan2-7B-Chat-4bits in the configuration cell.

上传模型文件到s3目录

In [None]:
# Upload the model files to S3. The clone's .git directory is excluded: it is
# not needed for inference and also holds Git LFS copies of the large files,
# so uploading it would inflate both the upload and the endpoint's download.
!aws s3 sync $local_model_dir $s3_location --exclude ".git/*"

In [11]:
# Create the directory that receives requirements.txt and inference.py.
# -p keeps the cell re-runnable when the directory already exists.
!mkdir -p code

In [None]:
%%writefile code/requirements.txt

# Package index mirror used for every requirement listed below.
-i https://pypi.tuna.tsinghua.edu.cn/simple

# Python dependencies written next to code/inference.py.
# NOTE(review): inference.py directly imports only PIL, requests, boto3,
# sagemaker, torch and transformers; confirm which of the other packages are
# still required (some may be needed indirectly by the model's remote code).
diffusers
ftfy
spacy
boto3
sagemaker
nvgpu
sentencepiece
protobuf>=3.19.5,<3.20.1
transformers>=4.26.1
icetk
cpm_kernels
accelerate
colorama
bitsandbytes
transformers_stream_generator
xformers

In [None]:
%%writefile code/inference.py

# -*- coding: utf-8 -*-
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import io
import json
import os
import subprocess
import sys
import traceback
import uuid

import boto3
import requests
import sagemaker
import torch
from PIL import Image
from torch import autocast
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.utils import GenerationConfig

# Local directory that receives the model files; it is the path handed to the
# from_pretrained() loaders in this module.
LLM_NAME = "/opt/amazon/var/run/"
s3_location = "s3://your-bucket-name/your-s3-dir" # Fill in the s3_location value from the earlier notebook step, example-> s3://your-bucket-name/your-s3-dir

# Download the model files when the module is imported. The command is passed
# as an argument list (no shell string building) and check=True makes a failed
# download raise immediately instead of surfacing later as a confusing
# "cannot load tokenizer" error.
subprocess.run(["aws", "s3", "sync", s3_location, LLM_NAME], check=True)


# Shared tokenizer, loaded once at import time and reused for every request.
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME, trust_remote_code=True)


def preprocess(text):
    """Escape newlines and tabs in ``text`` as the literal sequences ``\\n`` and ``\\t``.

    Args:
        text: The raw prompt string.

    Returns:
        The prompt with each newline replaced by backslash + "n" and each tab
        replaced by backslash + "t".
    """
    newlines_escaped = text.replace("\n", "\\n")
    return newlines_escaped.replace("\t", "\\t")

def postprocess(text):
    """Turn literal ``\\n`` / ``\\t`` sequences in ``text`` back into real newlines and tabs.

    Args:
        text: The model reply, possibly containing escaped newlines/tabs.

    Returns:
        The reply with backslash + "n" replaced by a newline and backslash + "t"
        replaced by a tab (newlines are restored first, then tabs).
    """
    newlines_restored = text.replace("\\n", "\n")
    return newlines_restored.replace("\\t", "\t")

def answer(text, sample=True, top_p=0.45, temperature=0.01, model=None):
    """Send ``text`` to the chat model as a single user turn and return its reply.

    Args:
        text: The user's question.
        sample: Accepted for interface compatibility; not used by this function.
        top_p: Accepted for interface compatibility; not used by this function.
        temperature: Accepted for interface compatibility; not used by this function.
        model: Object exposing ``chat(tokenizer, messages)``; must not be None.

    Returns:
        The model's reply after ``postprocess`` has been applied to it.
    """
    # A one-message conversation: the escaped prompt as the user's turn.
    conversation = [{"role": "user", "content": preprocess(text)}]
    reply = model.chat(tokenizer, conversation)
    return postprocess(reply)


# def model_fn(model_dir):
#     """
#     Load the model for inference,load model from os.environ['model_name'],diffult use stabilityai/stable-diffusion-2
    
#     """
#     print("=================model_fn_Start=================")
#     model = AutoModelForCausalLM.from_pretrained(LLM_NAME, trust_remote_code=True).half().cuda()
#     #model = model.to("cuda")
#     print("=================model_fn_End=================")
#     return model

def model_fn(model_dir):
    """Load the causal language model used to serve requests.

    The model and its generation config are both read from ``LLM_NAME``;
    the ``model_dir`` argument is not used.

    Args:
        model_dir: Model directory passed in by the caller (ignored).

    Returns:
        The loaded model with ``generation_config`` set from ``LLM_NAME``.
    """
    print("=================model_fn_Start=================")
    # device_map="auto" delegates device placement to the loader;
    # trust_remote_code=True allows the model code shipped with the weights to run.
    load_options = {"device_map": "auto", "trust_remote_code": True}
    llm = AutoModelForCausalLM.from_pretrained(LLM_NAME, **load_options)
    llm.generation_config = GenerationConfig.from_pretrained(LLM_NAME)
    print("=================model_fn_End=================")
    return llm



def input_fn(request_body, request_content_type):
    """Parse the JSON request body and make sure it carries an ``ask`` prompt.

    Expected payload shape: ``{"ask": "<question>"}``. When the ``ask`` key is
    missing, a built-in default prompt is filled in.

    Args:
        request_body: JSON text of the request.
        request_content_type: Content type of the request; only logged.

    Returns:
        The decoded payload, always containing an ``ask`` entry.
    """
    print(f"=================input_fn=================\n{request_content_type}\n{request_body}")
    payload = json.loads(request_body)
    if 'ask' in payload:
        return payload
    payload['ask'] = "写一个文章，题目是未来城市"
    return payload




def predict_fn(input_data, model):
    """Run the model on a parsed request and return the answer text.

    Args:
        input_data: Parsed request; reads the ``ask`` prompt and an optional
            ``temperature`` (default 0.01).
        model: The loaded model, handed through to ``answer``.

    Returns:
        The generated answer, or the string ``'Not found answer'`` when
        anything goes wrong while producing it (the error is printed to stdout).
    """
    print("=================predict_fn=================")
    print('input_data: ', input_data)

    try:
        # Forward the request's temperature instead of parsing it and dropping
        # it; whether it affects generation is up to answer().
        temperature = input_data.get('temperature', 0.01)
        result = answer(input_data['ask'], temperature=temperature, model=model)
        print(f'====result {result}====')
        return result
    except Exception as ex:
        # Best effort by design: log the failure and fall through to the
        # placeholder reply rather than failing the request.
        traceback.print_exc(file=sys.stdout)
        print(f"=================Exception================={ex}")

    return 'Not found answer'


def output_fn(prediction, content_type):
    """Wrap the prediction in a JSON document of the form ``{"answer": ...}``.

    Args:
        prediction: The value returned by ``predict_fn``.
        content_type: Requested response content type; only printed.

    Returns:
        A JSON string with the prediction stored under the ``answer`` key.
    """
    print(content_type)
    response_body = {'answer': prediction}
    return json.dumps(response_body)

In [None]:
import boto3
import sagemaker

# Look up the caller's AWS account and the session's region, then resolve the
# SageMaker session, its default bucket and the notebook's execution role.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Echo the resolved values, one per line, for a quick sanity check.
for resolved_value in (role, bucket, region_name):
    print(resolved_value)


# In China regions, make sure requirements.txt points pip at the Tsinghua mirror.
# The line is only prepended when it is not already present, so re-running this
# cell no longer stacks another copy of it on top of the file each time.
if "cn-" in region_name:
    pypi_mirror_line = "-i https://pypi.tuna.tsinghua.edu.cn/simple"
    with open('./code/requirements.txt', 'r') as requirements_file:
        requirements_text = requirements_file.read()
    if pypi_mirror_line not in requirements_text:
        with open('./code/requirements.txt', 'w') as requirements_file:
            requirements_file.write(pypi_mirror_line + "\n" + requirements_text)

# Upload a placeholder model.tar.gz (an archive holding one empty file) and
# remember its S3 URI in model_data for the model definition.
asset_prefix = 'llm_chinese'
assets_dir = f's3://{bucket}/{asset_prefix}/assets/'
model_data = f's3://{bucket}/{asset_prefix}/assets/model.tar.gz'

!touch dummy
!tar czvf model.tar.gz dummy
!aws s3 cp model.tar.gz {assets_dir}
# Remove the local scratch files once the archive has been copied.
!rm -f dummy model.tar.gz

from sagemaker.huggingface.model import HuggingFaceModel

model_name = None
entry_point = 'inference.py'

# Server settings kept for reference; they are NOT passed to the model below
# (the env= argument is intentionally left disabled, as in the original cell).
model_environment = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT':'420',
    'SAGEMAKER_MODEL_SERVER_WORKERS': '1',
}

# Inference container image: China regions use a different registry account
# and the amazonaws.com.cn domain; the image name and tag are the same.
image_name = 'huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'
if "cn-" in region_name:
    url = f'727897471807.dkr.ecr.{region_name}.amazonaws.com.cn/{image_name}'
else:
    url = f'763104351884.dkr.ecr.{region_name}.amazonaws.com/{image_name}'

# The image is pinned through image_uri, so framework_version / py_version are
# not specified.
model = HuggingFaceModel(
    name=model_name,
    model_data=model_data,
    entry_point=entry_point,
    source_dir='./code',
    role=role,
    image_uri=url,
)

import boto3
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

#endpoint_name = 'hf-inference-baichuan-v1'
endpoint_name = 'pytorch-inference-llm-v1'
# instance_type='ml.g5.4xlarge'
instance_type = 'ml.g4dn.4xlarge'

instance_count = 1

# Best-effort removal of an endpoint config with the same name from an earlier
# run. Only API errors from the SageMaker client (e.g. the config not existing)
# are tolerated, and they are reported instead of being silently swallowed;
# anything else (interrupts, programming errors) now propagates.
client = boto3.client('sagemaker')
try:
    response = client.delete_endpoint_config(EndpointConfigName=endpoint_name)
except client.exceptions.ClientError as error:
    print(f"Endpoint config '{endpoint_name}' was not deleted: {error}")

# Deploy the model; requests and responses are exchanged as JSON.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=instance_count,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

### 测试


In [15]:
#休眠2分钟,确保模型可以完全加载
# Sleep for 2 minutes to make sure the model has been fully loaded.
import time

warmup_seconds = 120
time.sleep(warmup_seconds)

In [20]:
# Send each test question to the endpoint and print the returned answer.
test_questions = (
    "你好!",
    "晚上睡不着应该怎么办",
)

for question in test_questions:
    inputs = {"ask": question}
    response = predictor.predict(inputs)
    print(response["answer"])

你好！有什么我可以帮助你的？
1.保持安静的环境：尽量让房间保持安静，避免噪音干扰。
2. 调整温度和湿度：保持室内温度和湿度适中，有助于入睡。
3. 调整光线：避免过亮的光线影响睡眠，可以使用遮光窗帘或眼罩来遮挡光线。
4. 放松身心：可以尝试一些放松身心的方法，如深呼吸、冥想等。
5. 适当运动：适当的运动可以帮助身体释放压力，促进睡眠。但要注意不要选择过于激烈的运动，以免让身体过于兴奋。
6. 避免过度刺激大脑：尽量避免看刺激性强的电视节目或者玩刺激性的游戏，以免大脑过于兴奋难以入睡。
7. 限制咖啡因和酒精的摄入：尽量避免在晚上摄入过多的咖啡因和酒精，这些物质会影响睡眠质量。
8. 建立规律的作息：尽量保持每天同一时间上床睡觉和起床，有助于调整生物钟，提高睡眠质量。


### 删除SageMaker  Endpoint
删除推理服务

In [None]:
# Clean up the inference service. The SageMaker Model resource is deleted as
# well so it is not left behind; this has to happen before the endpoint is
# removed. The finally-clause guarantees the endpoint itself is always deleted,
# even if removing the model fails.
try:
    predictor.delete_model()
finally:
    predictor.delete_endpoint()