In [2]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# Shared handles used by every cell below.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
default_bucket = sess.default_bucket()  # bucket to house artifacts
# Read the region through the public accessor instead of the private
# `_region_name` attribute, which is an SDK implementation detail.
region = sess.boto_region_name

## Edit code artifacts

In [10]:
# Print the serving configuration shipped with the model code. The model weights
# are referenced either through `option.model_id` (a Hugging Face model id) or
# through `option.s3url` (an S3 prefix).
!cat llama_2_model/serving.properties

engine=Python
option.tensor_parallel_degree=1
# option.model_id=LinkSoul/Chinese-Llama-2-7b
option.s3url=s3://genai.piyao.com/llm/meta-llama/Llama-2-7b-chat-hf/

## Construct artifacts and deploy to SageMaker endpoint

In [29]:
# Package the code directory into a gzipped tarball named after the directory.
code_tarname = 'llama_2_model' # code directory; also the base name of the archive

# Remove any previous archive and the notebook checkpoint folder before packaging.
!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
# NOTE(review): the recorded listing shows model.py.backup is packaged as well — confirm that is intended.
!tar czvf {code_tarname}.tar.gz {code_tarname}/

llama_2_model/
llama_2_model/model.py
llama_2_model/model.py.backup
llama_2_model/serving.properties
llama_2_model/requirements.txt


In [30]:
# Preview the kind of unique, timestamp-suffixed name that name_from_base generates.
name_from_base("tmp06/v2")

'tmp06/v2-2023-08-29-02-34-24-715'

In [31]:
# Pick a Large Model Inference (DJL) container image. Available tags are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.8.3-cu118"
# inference_image_uri = f"768219110428.dkr.ecr.cn-northwest-1.amazonaws.com.cn/sm-tgi100-winston:latest"

# Upload the code tarball to the default bucket under a freshly generated key prefix.
# Any valid S3 path works, as long as it differs from where the HF model artifacts live.
code_tarball = f"{code_tarname}.tar.gz"
code_key_prefix = name_from_base("tmp06/v2")
s3_code_artifact = sess.upload_data(
    path=code_tarball,
    bucket=default_bucket,
    key_prefix=code_key_prefix,
)

# Endpoint name: the code directory name with underscores turned into hyphens,
# passed through name_from_base to make it unique.
endpoint_name = name_from_base("-".join(code_tarname.split("_")))

In [32]:
# Echo the image URI to check that the region was substituted into it.
inference_image_uri


'727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.8.3-cu118'

In [None]:
from sagemaker.model import Model

# Describe the model: serving container image, code tarball on S3, execution role.
model = Model(
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
    role=role,
)

# Deploy to a single ml.g4dn.4xlarge instance with a startup health-check timeout of 2900.
deploy_settings = dict(
    initial_instance_count=1,
    instance_type='ml.g4dn.4xlarge',
    endpoint_name=endpoint_name,
    container_startup_health_check_timeout=2900,
)
model.deploy(**deploy_settings)



-------------

## Wrap the endpoint in a predictor and send requests

In [15]:
from sagemaker import serializers, deserializers

# To attach to an endpoint that already exists, set its name here instead:
# endpoint_name = 'llama-2-model-2023-08-28-09-26-16-994'

# Client for the endpoint: requests are JSON-serialized, responses JSON-deserialized.
json_in = serializers.JSONSerializer()
json_out = deserializers.JSONDeserializer()
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_in,
    deserializer=json_out,
)

In [27]:

# System prompt for this experiment: intentionally just a newline.
system_prompt = """
"""

# User turn: retrieved knowledge followed by the question to answer from it.
ask = """
you should use the knowledge provided to answer user's question.  
the knowledge you known are: [21] after modification.\n\nThe ABTS+ radical reaction solution configuration was as follows: 5 mL of 7 mmol/L of ABTS and 5 mL of 2.45 mmol/L of potassium persulfate were mixed and stored in the dark for 12 h. Before use, 0.1 mol/L of pH 7.4 phosphate buffer saline (PBS) was added to dilute until the OD734 value was 0.70 ± 0.02.\n\nThe sample solution was the same as that of the EPS sample solution measured by DPPH clearing ability.
question: how to config the ABTS  radical reaction  ? 
"""

def get_prompt(message: str, chat_history: list[tuple[str, str]]) -> str:
    """Assemble a chat prompt from the global ``system_prompt``, past turns and a new message.

    Args:
        message: The new user message; surrounding whitespace is stripped.
        chat_history: Earlier ``(user_input, response)`` pairs, emitted in the given order.

    Returns:
        The system block, then every past turn terminated by ``</s><s> [INST] ``,
        then the stripped message followed by `` [/INST]``.
    """
    header = f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n'
    past_turns = ''.join(
        f'{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] '
        for user_input, response in chat_history
    )
    return f'{header}{past_turns}{message.strip()} [/INST]'

# Wrap the raw question in the chat template (no earlier turns) and show the final prompt.
ask = get_prompt(ask, [])
print(ask)

# Send the formatted prompt to the endpoint, requesting at most 300 new tokens.
generation_params = {"max_new_tokens": 300}
predictor.predict({"ask": ask, "parameters": generation_params})

[INST] <<SYS>>


<</SYS>>

you should use the knowledge provided to answer user's question.  
the knowledge you known are: [21] after modification.

The ABTS+ radical reaction solution configuration was as follows: 5 mL of 7 mmol/L of ABTS and 5 mL of 2.45 mmol/L of potassium persulfate were mixed and stored in the dark for 12 h. Before use, 0.1 mol/L of pH 7.4 phosphate buffer saline (PBS) was added to dilute until the OD734 value was 0.70 ± 0.02.

The sample solution was the same as that of the EPS sample solution measured by DPPH clearing ability.
question: how to config the ABTS  radical reaction  ? [/INST]


{'answer': '  Based on the information provided, the ABTS radical reaction configuration is as follows:\n1. Start with 5 mL of 7 mmol/L ABTS solution.\n2. Add 5 mL of 2.45 mmol/L potassium persulfate solution to the ABTS solution.\n3. Mix the solutions in the dark for 12 hours.\n4. Before use, dilute the reaction mixture with 0.1 mol/L pH 7.4 phosphate buffer saline (PBS) to achieve an OD734 value of 0.70 ± 0.02.\nSo, the final configuration for the ABTS radical reaction is:\nABTS solution: 5 mL of 7 mmol/L ABTS\nPotassium persulfate solution: 5 mL of 2.45 mmol/L K2S2O8\nDilution: 0.1 mol/L pH 7.4 PBS\nTime: 12 hours in the dark\nOD734 value: 0.70 ± 0.02'}

In [24]:
# Experiment: send instructions, knowledge and question as one raw string,
# without the [INST] wrapper that get_prompt applies.
# NOTE(review): the recorded result for this cell is an empty answer, and the cell
# rebinds the global `system_prompt` that get_prompt reads — confirm both are intended.
system_prompt = """
You are a helpful, respectful and honest assistant. you should use the knowledge provided to answer user's question.  
the knowledge you known are: [21] after modification.\n\nThe ABTS+ radical reaction solution configuration was as follows: 5 mL of 7 mmol/L of ABTS and 5 mL of 2.45 mmol/L of potassium persulfate were mixed and stored in the dark for 12 h. Before use, 0.1 mol/L of pH 7.4 phosphate buffer saline (PBS) was added to dilute until the OD734 value was 0.70 ± 0.02.\n\nThe sample solution was the same as that of the EPS sample solution measured by DPPH clearing ability.
question: how to config the ABTS  radical reaction  ? 
"""

raw_request = {"ask": system_prompt, "parameters": {"max_new_tokens": 300}}
predictor.predict(raw_request)

{'answer': ''}

In [18]:
%%timeit -n3 -r1
# Rough latency measurement: a single run of three calls (-r1 -n3).
# NOTE(review): this payload uses the "inputs" key, while the other requests in this
# notebook send the prompt under "ask" — confirm which key the handler reads.
predictor.predict(
    {"inputs": "in order to make a good pizza, i need to ", "parameters": {"max_new_tokens": 200}}
)

6.73 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)


In [28]:
# Tear down the model and the endpoint created by the deploy cell.
# NOTE(review): delete_model() runs before delete_endpoint(); presumably it needs the
# endpoint configuration to still exist in order to find the model — confirm before reordering.
# NOTE(review): cells further down still call predictor.predict, so a top-to-bottom
# run will not get past this point with a working endpoint.
predictor.delete_model()
predictor.delete_endpoint()

In [10]:




# Default system prompt; get_prompt below reads it from the notebook's global scope.
system_prompt = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
            If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
            """


def get_prompt(message: str, chat_history: list[tuple[str, str]]) -> str:
    """Assemble a chat prompt from the global ``system_prompt``, past turns and a new message.

    Args:
        message: The new user message; surrounding whitespace is stripped.
        chat_history: Earlier ``(user_input, response)`` pairs, emitted in the given order.

    Returns:
        The system block, then every past turn terminated by ``</s><s> [INST] ``,
        then the stripped message followed by `` [/INST]``.
    """
    texts = [f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    for user_input, response in chat_history:
        texts.append(f'{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ')
    texts.append(f'{message.strip()} [/INST]')
    return ''.join(texts)


In [11]:
# Single-turn request (empty history) built with get_prompt.
texts = get_prompt("how old are you?", [])

# Generation settings; "return_text" and "return_full_text" are left unset.
sampling_params = {
    "do_sample": True,
    "max_new_tokens": 100,
    "temperature": 0.5,
}
response = predictor.predict({"ask": texts, "parameters": sampling_params})

response
    

{'answer': "As an AI language model, I don't have an age. I am a computer program that processes and generates text based on the input I receive. I don't age in the way that humans do, and I don't have a physical form. "}

In [51]:
# `getattr` is a Python built-in function, not a module, so it needs no import
# (`import getattr` raises ModuleNotFoundError). Call it directly, e.g.
# getattr(obj, "name", default).