# Building a simple RAG chatbot with LangChain, Hugging Face, FAISS, Amazon SageMaker and Amazon Textract

In [1]:
%%sh
pip install sagemaker langchain amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf pip install faiss-cpu -qU

In [2]:
import boto3, json, sagemaker
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Deploy LLM on SageMaker

In [3]:
#t5 XL
# Hub Model configuration. https://huggingface.co/models
role = sagemaker.get_execution_role()

hub = {
	#'HF_MODEL_ID':'google/flan-t5-small',
    'HF_MODEL_ID':'google/flan-t5-xl',
	'SM_NUM_GPUS': json.dumps(1)
}



# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="1.1.0"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g4dn.4xlarge",
	container_startup_health_check_timeout=300,
    endpoint_name="flan-t5-demo"
  )
  
# send request
predictor.predict({
	"inputs": "Translate to German:  My name is Arthur",
})

---------!

[{'generated_text': 'Ich bin Arthur.'}]

In [4]:
endpoint_name = predictor.endpoint_name
endpoint_name

'flan-t5-demo'

**Zero Shot example** 1. Ask a question to LLM without providing the context
To better illustrate why we need retrieval-augmented generation (RAG) based approach to solve the question and anwering problem. Let's directly ask the model a question and see how they respond.

In [5]:
question = "Which instances can I use with Managed Spot Training in SageMaker?"

out = predictor.predict({"inputs": question})
out

[{'generated_text': 'SageMaker and SageMaker XL.'}]

Step 3. Improve the answer to the same question using prompt engineering with insightful context
To better answer the question well, we provide extra contextual information, combine it with a prompt, and send it to model together with the question. Below is an example.

In [6]:
context = """Managed Spot Training can be used with all instances
supported in Amazon SageMaker. Managed Spot Training is supported
in all AWS Regions where Amazon SageMaker is currently available."""

In [7]:
prompt_template = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

text_input = prompt_template.replace("{context}", context).replace("{question}", question)

out = predictor.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {question}\n[Output]: {generated_text}")

[Input]: Which instances can I use with Managed Spot Training in SageMaker?
[Output]: all instances supported in Amazon SageMaker


Let's see if our LLM is capable of following our instructions...

In [8]:
unanswerable_question = "What color is my desk?"

text_input = prompt_template.replace("{context}", context).replace("{question}", unanswerable_question)

out = predictor.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {unanswerable_question}\n[Output]: {generated_text}")

[Input]: What color is my desk?
[Output]: I don't know


## Configure LLM in LangChain

In [9]:
model_kwargs = {"max_new_tokens": 512, "top_p": 0.8, "temperature": 0.8}

In [10]:
#define content handler class
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        input_data = {
            "inputs": prompt,  # Adjust this field based on the expected input format
            **model_kwargs,
        }
        input_str = json.dumps(input_data)
        return input_str.encode("utf-8")
     
    def transform_output(self, output: 'StreamingBody') -> str:
            response_json = json.loads(output.read().decode("utf-8"))
            print(response_json)
            return response_json[0]["generated_text"]


content_handler = ContentHandler()


In [11]:
import boto3
sm_client = boto3.client("sagemaker-runtime") # needed for AWS credentials

llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)



## Zero-shot example

In [12]:
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt = PromptTemplate.from_template(system_prompt + "{context}")

In [13]:
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [14]:
context ="Solar investments trends in China have increased by 30% each year in the last decade"
question = "What is the latest trend for solar investments in China?"

query = f"question: {question}"

In [15]:
query = f"question: {question}"
print(query)

question: What is the latest trend for solar investments in China?


In [16]:
answer = llm_chain.run({"context": context, "question": query})
print(answer)

  warn_deprecated(


[{'generated_text': "I don't know"}]
I don't know


In [28]:
#not using
from typing import Dict
import json
from langchain_community.llms.sagemaker_endpoint import SagemakerEndpoint, LLMContentHandler
#from langchain.chains import PromptTemplate, LLMChain

class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        input_data = {
            "inputs": prompt,  # Adjust this field based on the expected input format
            **model_kwargs,
        }
        input_str = json.dumps(input_data)
        return input_str.encode("utf-8")

    def transform_output(self, output: 'StreamingBody') -> str:
            response_json = json.loads(output.read().decode("utf-8"))
            print(response_json)
            return response_json[0]["generated_text"]

content_handler = ContentHandler()

import boto3

# Replace 'endpoint_name' and 'model_kwargs' with your actual values
endpoint_name = "flan-t5-demo"
model_kwargs = {"max_new_tokens": 512, "top_p": 0.8, "temperature": 0.8}

sm_client = boto3.client("sagemaker-runtime")  # needed for AWS credentials

llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)

system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

llm_chain = LLMChain(llm=llm, prompt=prompt_template)

context = "Solar investments in China have increased by 30% each year in the last decade"
question = "What is the latest trend for solar investments in China?"

query = f"question: {question}"
print(query)

answer = llm_chain.run({"context": context, "question": query})
print(answer)


question: What is the latest trend for solar investments in China?
[{'generated_text': "I don't know"}]
I don't know


In [36]:
import json
import boto3

# Create a SageMaker client
sagemaker_runtime = boto3.client('sagemaker-runtime')  # Replace 'your-region' with your AWS region

# Define the input data
text_input = {
    "inputs": "Answer the following QUESTION based on the CONTEXT\ngiven. If you do not know the answer and the CONTEXT doesn't\ncontain the answer truthfully say \"I don't know\".\n\nCONTEXT:\nSolar investments in China have increased by 30% each year in the last decade\n\nQUESTION:\nWhat is the latest trend for solar investments in China?\n\nANSWER:\nSolar investments in China have increased by 30% each year in the last decade",
    "max_new_tokens": 512,
    "top_p": 0.8,
    "temperature": 0.8
}

# Call the SageMaker endpoint
try:
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName='flan-t5-demo',
        ContentType='application/json',
        Body=json.dumps(text_input)
    )

    result = json.loads(response['Body'].read().decode())
    print(result)

except Exception as e:
    print(f"Error raised by inference endpoint: {e}")


[{'generated_text': "I don't know"}]


## RAG example with PDF files

In [22]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader

In [44]:
#!pip install pypdf
#!pip install pypdf2

In [23]:
#this is working ,it is created as pypdf can't load s3 object directly
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Specify your S3 bucket name and item name
bucket_name = "bo-automation"
item_name = "langchain-rag-demo/Coal2022.pdf"

# Create an S3 client
s3 = boto3.client("s3")

# Get the PDF file content from S3
response = s3.get_object(Bucket=bucket_name, Key=item_name)
pdf_content = response["Body"].read()

# Use BytesIO to create a file-like object from the PDF content
pdf_file = BytesIO(pdf_content)

# Save the contents to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(pdf_content)
    temp_file_path = temp_file.name

# Create a PyPDFLoader instance and load the PDF document
loader = PyPDFLoader(temp_file_path)
docs = loader.load()
print(len(docs))

# Now you can work with the 'document' object, which represents the PDF content
# For example, you can access the pages: document.pages

# Optionally, delete the temporary file
os.remove(temp_file_path)

137


In [27]:
#working fine to load multiple files
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Initialize Boto3 S3 client
s3 = boto3.client('s3')
bucket_name = "bo-automation"
prefix = "langchain-rag-demo/"

# List objects within the specified directory
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Initialize a list to hold all documents
all_docs = []

# Iterate over each file and load its contents
for obj in response.get('Contents', []):
    item_name = obj['Key']
    # Process only PDF files
    if item_name.endswith('.pdf'):
        print(f"Loading file: {item_name}")

        # Get the PDF file content from S3
        pdf_response = s3.get_object(Bucket=bucket_name, Key=item_name)
        pdf_content = pdf_response["Body"].read()

        # Use BytesIO to create a file-like object from the PDF content
        pdf_file = BytesIO(pdf_content)

        # Save the contents to a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Load the PDF document
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()
        all_docs.extend(docs)
        print(len(docs))

        # Optionally, delete the temporary file
        os.remove(temp_file_path)

# Now all_docs contains documents from all PDF files


Loading file: langchain-rag-demo/Coal2022.pdf
137
Loading file: langchain-rag-demo/WorldEnergyInvestment2023.pdf
181
Loading file: langchain-rag-demo/WorldEnergyOutlook2023.pdf
355


In [42]:
#not using,but working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

chunks = text_splitter.split_documents(docs)

# Print information about the chunks
print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")


Original text length: 355, number of chunks: 4654


In [44]:
#working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)
all_chunks = []

# Assuming 'docs' is the list of loaded documents
for document in docs:
    # Extract text from the document
    #text = document.content  # Adjust this based on the actual structure of your Document object

    # Split the text into chunks
    chunks = text_splitter.split_documents(docs)

    # Add the chunks to the list
    all_chunks += chunks

    # Print information about the chunks
    print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")

#chunks = text_splitter.split_documents(docs)

# Print information about the chunks
print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")


Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654
Original text length: 355, number of chunks: 4654


In [132]:
#not using
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Specify the chunk size and overlap
chunk_size = 180
chunk_overlap = 0

# Initialize the splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

all_chunks = []

# Assuming 'docs' is the list of loaded documents
for document in docs:
    # Extract text from the document
    #text = document.content  # Adjust this based on the actual structure of your Document object

    # Split the text into chunks
    chunks = splitter.split_documents(docs)

    # Add the chunks to the list
    all_chunks += chunks

    # Print information about the chunks
    print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")


Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701
Original text length: 137, number of chunks: 1701


In [77]:
#not using
from langchain.document_loaders import DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma, AtlasDB, FAISS
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import S3FileLoader

loader = S3FileLoader("s3://bo-automation/langchain-rag-demo/", show_progress=True)
data = loader.load()

# Chunk our documents into smaller sizes for better responses
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks=text_splitter.split_documents(data)
# Use FAISS to create a vector index from our doc chunks and embeddings FM
db = FAISS.from_documents(chunks, embeddings)
db.save_local("csm_chunks_index") #save vector DB to file
# Chunk our documents into smaller sizes for better responses
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks=text_splitter.split_documents(data)
# Use FAISS to create a vector index from our doc chunks and embeddings FM
db = FAISS.from_documents(chunks, embeddings)
db.save_local("csm_chunks_index") #save vector DB to file

FileNotFoundError: Directory not found: 's3://bo-automation/langchain-rag-demo/'

In [79]:
#not using
from langchain.document_loaders import DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma, AtlasDB, FAISS
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# S3 bucket and prefix
bucket = "bo-automation"
prefix = "langchain-rag-demo"

# Load documents from S3
loader = S3Loader(bucket=bucket, prefix=prefix)
data = loader.load()

# Split documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(data)

# Use embeddings (replace 'your_model_name' with the actual model name)
embeddings = HuggingFaceEmbeddings("your_model_name")
# Embed the chunks
embeddings.embed_documents(chunks)

# Use FAISS to create a vector index from the embedded chunks
db = FAISS.from_documents(chunks)
db.save_local("csm_chunks_index")  # Save vector DB to file


NameError: name 'S3Loader' is not defined

### Analyze documents with Amazon Textract and split them in chunks

In [48]:
#not using
import boto3
from PyPDF2 import PdfReader
from io import BytesIO

# Assuming you have the list of URIs
uris = ["s3://bo-automation/langchain-rag-demo/Coal2022.pdf","s3://bo-automation/langchain-rag-demo/WorldEnergyInvestment2023.pdf", "s3://bo-automation/langchain-rag-demo/WorldEnergyOutlook2023.pdf"]

for uri in uris:
    print(f"Loading {uri}")

    # Extract bucket name and object key from the URI
    uri_parts = uri.split("/")
    bucket_name = uri_parts[2]
    item_name = "/".join(uri_parts[3:])

    # Load the PDF using s3fs and PyPDF2
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, item_name)
    fs = obj.get()['Body'].read()
    pdf = PdfReader(BytesIO(fs))

    # Extract text using PyPDF2
    text = ""
    for page_num in range(len(pdf.pages)):
        page = pdf.pages[page_num]
        text += page.extract_text()

    # Process the extracted text as needed
    print(f"Loaded {uri}, text length: {len(text)}")
    #print(text)


ModuleNotFoundError: No module named 'PyPDF2'

Alternate approach

In [69]:
#not using
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [70]:
#not using
# Define S3 bucket and prefix for PDF storage

bucket = "bo-automation"
prefix = "langchain-rag-demo"

In [71]:
# Build list of S3 URIs
#not using

s3 = boto3.client("s3")
objs = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
objs = objs['Contents']
uris = [f's3://{bucket}/{obj["Key"]}' for obj in objs]
uris    

['s3://bo-automation/langchain-rag-demo/',
 's3://bo-automation/langchain-rag-demo/Coal2022.pdf',
 's3://bo-automation/langchain-rag-demo/WorldEnergyInvestment2023.pdf',
 's3://bo-automation/langchain-rag-demo/WorldEnergyOutlook2023.pdf']

In [72]:
#not using
%%time

textract_client = boto3.client('textract')
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

all_chunks = []

for uri in uris:
    loader = AmazonTextractPDFLoader(uri, client=textract_client)
    document = loader.load()
    chunks = splitter.split_documents(document)
    all_chunks += chunks
    print(f"Loaded {uri}, {len(document)} pages, {len(chunks)} chunks")

KeyError: 'DocumentMetadata'

In [75]:
#not using
import boto3
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define S3 bucket and prefix for PDF storage
bucket = "bo-automation"
prefix = "langchain-rag-demo"

# Build list of S3 URIs
s3 = boto3.client("s3")
objs = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
objs = objs['Contents']
uris = [f's3://{bucket}/{obj["Key"]}' for obj in objs]

textract_client = boto3.client('textract')
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

all_chunks = []

for uri in uris:
    loader = AmazonTextractPDFLoader(uri, client=textract_client)
    document = loader.load()
    
    # Debugging: Print the entire Textract response
    print("Textract Response:", document.raw_response)

    chunks = splitter.split_documents(document)
    all_chunks += chunks
    print(f"Loaded {uri}, {len(document)} pages, {len(chunks)} chunks")


KeyError: 'DocumentMetadata'

### Embed document chunks and store them in FAISS
https://github.com/facebookresearch/faiss 

In [28]:
#from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [29]:
# Define embedding model
# See https://huggingface.co/spaces/mteb/leaderboard

embedding_model_id = "BAAI/bge-small-en-v1.5"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [45]:
%%time
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(all_chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(all_chunks, embeddings)


Number of chunks: 1652170


KeyboardInterrupt: 

In [31]:
#not using
%%time
#alternate
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(chunks, embeddings)

Number of chunks: 1241
CPU times: user 10.8 s, sys: 220 ms, total: 11 s
Wall time: 19.8 s


In [33]:
# Save database
embeddings_db.save_local("faiss_index")

### Shortcut : load existing embedding database

In [34]:
embeddings_db = FAISS.load_local("faiss_index", embeddings)

********

### Configure RAG chain

In [35]:
retriever = embeddings_db.as_retriever(search_kwargs={"k": 5})

In [36]:
# Define prompt template
prompt_template = """
As a helpful energy specialist, please answer the question below, focusing on numerical data and using only the context below.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.

question: {question}

context: {context}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [37]:
#working version
# Define prompt template1
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

In [38]:
chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt_template})

### Ask our question again

In [41]:
question = "outlook for Global coal consumption in 2025?"
answer = chain.run({"query": question})
#print(answer)

#answer = chain.run({"query": question})
print(answer)


ValueError: Error raised by inference endpoint: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 4025 `inputs` tokens and 20 `max_new_tokens`","error_type":"validation"}". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/flan-t5-demo in account 254455524940 for more information.

In [40]:
question = "coal production?"
answer = chain.run({"query": question})
print(answer)

ValueError: Error raised by inference endpoint: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 4485 `inputs` tokens and 20 `max_new_tokens`","error_type":"validation"}". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/flan-t5-demo in account 254455524940 for more information.

Alternate approach

In [66]:
def rag_query(question: str) -> str:
    # create query vec
    query_vec = embed_docs(question)[0]
    # query pinecone
    res = index.query(query_vec, top_k=5, include_metadata=True)
    # get contexts
    contexts = [match.metadata['text'] for match in res.matches]
    # build the multiple contexts string
    context_str = construct_context(contexts=contexts)
    # create our retrieval augmented prompt
    text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)
    # make prediction
    out = llm.predict({"inputs": text_input})
    return out[0]["generated_text"]

In [67]:
rag_query("What does STEPS mean?")

NameError: name 'index' is not defined

## Delete endpoint and model

In [1]:
predictor.delete_model()
predictor.delete_endpoint()

NameError: name 'predictor' is not defined

In [151]:
import boto3

sagemaker = boto3.client('sagemaker')
response = sagemaker.list_endpoints()

if not response['Endpoints']:
    print("No active endpoints.")
