In [2]:
# %%sh
# pip install sagemaker langchain amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf faiss-cpu -qU

In [3]:
import boto3, json, sagemaker
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Deploy LLM on SageMaker

In [4]:
# Execution role handed to the model definition below.
role = sagemaker.get_execution_role()

# Container environment: which Hugging Face model to serve and on how many GPUs.
hub = dict(
    HF_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.1",
    SM_NUM_GPUS="1",
)

# Resolve the LLM serving image first, then describe the model that runs in it.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="1.1.0")
huggingface_model = HuggingFaceModel(image_uri=llm_image_uri, env=hub, role=role)

# Create the endpoint on a single instance and keep the predictor handle
# for the endpoint name and for cleanup at the end of the notebook.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)

---------!

In [5]:
# Keep the name of the deployed endpoint; the LangChain LLM wrapper is
# configured with it further down. The bare expression displays the value.
endpoint_name = predictor.endpoint_name
endpoint_name

'huggingface-pytorch-tgi-inference-2024-01-13-04-46-32-010'

## Configure LLM in LangChain

In [6]:
# Generation parameters forwarded to the endpoint as the request's "parameters".
model_kwargs = dict(max_new_tokens=512, top_p=0.8, temperature=0.8)

In [7]:
class ContentHandler(LLMContentHandler):
    """Convert between LangChain prompts and the endpoint's JSON payloads.

    Outgoing prompts are wrapped in the Mistral instruct template; incoming
    responses are reduced to the text that follows the instruction block.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: Text to place inside the ``[INST] ... [/INST]`` block.
            model_kwargs: Generation parameters, copied into ``"parameters"``.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        input_str = json.dumps(
            # Mistral prompt, see https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
            {"inputs": f"<s>[INST] {prompt} [/INST]", "parameters": {**model_kwargs}}
        )
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Extract the model's answer from the endpoint response.

        Args:
            output: Response body. The annotation is left as in the original,
                but the code calls ``.read()`` on it, so it must be a
                file-like object yielding UTF-8 JSON.

        Returns:
            The text after the first ``[/INST]`` marker, minus the single
            separating space. If the marker is missing, the whole generated
            text is returned instead of raising ``IndexError``.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        generated_text = response_json[0]["generated_text"]

        # Split once on the marker only: the answer may start with a newline
        # rather than a space, and a marker repeated inside the answer must
        # not truncate it.
        _, marker, completion = generated_text.partition("[/INST]")
        if not marker:
            # Prompt was not echoed back; everything received is the answer.
            return generated_text
        return completion[1:] if completion.startswith(" ") else completion

content_handler = ContentHandler()

In [8]:
# Runtime client that the LangChain wrapper uses to invoke the endpoint.
sm_client = boto3.client("sagemaker-runtime") # needed for AWS credentials

# LLM object for LangChain: requests and responses pass through
# content_handler, and model_kwargs are sent along with each prompt.
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)

## Zero-shot example

In [9]:
# Instructions that precede the user's text in the zero-shot prompt.
system_prompt = """
As a helpful human resource specialist, please answer the question.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.
"""

# "{content}" is the template's only placeholder; it is filled in when the chain runs.
prompt = PromptTemplate.from_template(system_prompt + "{content}")

In [10]:
# Chain that fills in `prompt` and sends the result to `llm`.
# NOTE(review): the run output below shows a deprecation warning — check which API the installed LangChain version recommends.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [11]:
# The question we want answered, first without any supporting documents.
question = "How many years of experience does Ednalyn C. De Dios have?"

# Label the question before it is substituted into the prompt template.
query = "question: " + question

In [12]:
# Supply the query under the template's "content" variable.
# Note that `{query}` alone is a one-element *set* literal, not a mapping:
# its repr (braces and quotes included) would be substituted into the prompt
# instead of the plain question text.
answer = llm_chain.run({"content": query})
print(answer)

  warn_deprecated(


I don't have access to Ednalyn C. De Dios' personal information, so I cannot provide an answer to that question.


## RAG example with PDF files

In [13]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [14]:
# S3 bucket that holds the PDF files to index (the whole bucket is listed; no key prefix is used)

bucket = "rag-langchain-demo"

In [15]:
# Show which bucket the next cell will list.
print(bucket)

rag-langchain-demo


In [16]:
 # Build list of S3 URIs

s3 = boto3.client("s3")

# A single list_objects_v2 call returns at most 1,000 keys, so walk every page.
# A response for an empty bucket carries no "Contents" key, hence the default.
objs = [
    obj
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket)
    for obj in page.get("Contents", [])
]

# One s3:// URI per object; the bare expression displays the list.
uris = [f's3://{bucket}/{obj["Key"]}' for obj in objs]
uris

['s3://rag-langchain-demo/1026-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf',
 's3://rag-langchain-demo/1101-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf',
 's3://rag-langchain-demo/925-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf']

## Analyze documents with Amazon Textract and split them into chunks

In [17]:
%%time

# One Textract client and one splitter are shared by all documents.
textract_client = boto3.client('textract')
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

# Chunks from every document, collected in listing order.
all_chunks = []

for uri in uris:
    # Load the file through Textract, then cut the result into chunks.
    document = AmazonTextractPDFLoader(uri, client=textract_client).load()
    chunks = splitter.split_documents(document)
    all_chunks.extend(chunks)
    print(f"Loaded {uri}, {len(document)} pages, {len(chunks)} chunks")

Loaded s3://rag-langchain-demo/1026-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf, 2 pages, 20 chunks
Loaded s3://rag-langchain-demo/1101-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf, 2 pages, 21 chunks
Loaded s3://rag-langchain-demo/925-ednalyn-de-dios-machine-learning-engineer-data-scientist-manager-readable-2023.pdf, 2 pages, 20 chunks
CPU times: user 772 ms, sys: 18.5 ms, total: 791 ms
Wall time: 28.3 s


## Embed document chunks and store them in FAISS

In [18]:
# Embedding model used for the document chunks
# See https://huggingface.co/spaces/mteb/leaderboard
embedding_model_id = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_id)

In [19]:
%%time
# Embed all chunks with `embeddings` and build a FAISS vector store from them
embeddings_db = FAISS.from_documents(all_chunks, embeddings)

CPU times: user 2.39 s, sys: 136 ms, total: 2.52 s
Wall time: 2.93 s


In [20]:
# Save database under the local path "faiss_index" so later sessions can skip the Textract and embedding steps
embeddings_db.save_local("faiss_index")

## Shortcut: load existing embedding database

In [21]:
# Reload the store saved under "faiss_index", passing the same embeddings
# object that was used to build it.
# NOTE(review): load_local is understood to unpickle stored data — only load
# index files you created yourself, and confirm whether the installed
# LangChain version requires an explicit opt-in flag for this.
embeddings_db = FAISS.load_local("faiss_index", embeddings)

## Configure RAG chain

In [22]:
# k=1: ask the vector store for a single chunk per query.
# NOTE(review): with 256-character chunks this hands the LLM very little context — consider a larger k if answers come back incomplete.
retriever = embeddings_db.as_retriever(search_kwargs={"k": 1})

In [23]:
# Define prompt template
# {question} and {context} are the two placeholders declared in input_variables below.
prompt_template = """
As a helpful human resource specialist, please answer the question.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.

question: {question}

context: {context}
"""

# NOTE: this rebinds `prompt`, which earlier named the zero-shot template.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [24]:
# Question-answering chain: chunks returned by `retriever` are combined with
# the question through `prompt` (chain type "stuff") and sent to `llm`.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

## Ask our question again

In [25]:
# Same question as in the zero-shot example, now routed through the retrieval chain.
question = "How many years of experience does Ednalyn C. De Dios have?"
# The question is passed to the chain under the "query" key.
answer = chain.run({"query": question})
print(answer)

Based on the information provided, Ednalyn C. De Dios has over 10 years of management experience. However, I don't have access to her full career history or education, so I can't provide a specific number of years of experience.


## Delete endpoint and model

In [26]:
# Remove the resources created by the deploy step.
# NOTE(review): the order looks deliberate — delete_model() presumably resolves the model through the still-existing endpoint; confirm before reordering.
predictor.delete_model()
predictor.delete_endpoint()