# Building a simple RAG chatbot with LangChain, Hugging Face, FAISS, Amazon SageMaker and Amazon Textract

In [10]:
%%sh
pip install sagemaker langchain amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf pip install faiss-cpu -qU

In [9]:
import boto3, json, sagemaker
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

## Deploy LLM on SageMaker

#t5 XL
# Hub Model configuration. https://huggingface.co/models
role = sagemaker.get_execution_role()

hub = {
	#'HF_MODEL_ID':'google/flan-t5-small',
    'HF_MODEL_ID':'google/flan-t5-xl',
	'SM_NUM_GPUS': json.dumps(1)
}



# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="1.1.0"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g4dn.4xlarge",
	container_startup_health_check_timeout=300,
    endpoint_name="flan-t5-demo"
  )
  
# send request
predictor.predict({
	"inputs": "Translate to German:  My name is Arthur",
})

In [13]:
#not used
# Hub Model configuration. https://huggingface.co/models
#llama
role = sagemaker.get_execution_role()

# Hub Model configuration. https://huggingface.co/models
hub = {
	'HF_MODEL_ID':'meta-llama/Llama-2-7b-chat-hf',
	'SM_NUM_GPUS': json.dumps(1),
	'HUGGING_FACE_HUB_TOKEN': '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'
}

#assert hub['HUGGING_FACE_HUB_TOKEN'] != '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', "You have to provide a token."

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="1.1.0"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g4dn.2xlarge",
	container_startup_health_check_timeout=300,
    endpoint_name="llama-7b-demo"
  )
  
# send request
#predictor.predict({
#	"inputs": "My name is Julien and I like to",
#})

----------------*

UnexpectedStatusException: Error hosting endpoint llama-7b-demo: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

In [48]:
#not run here,but create a new text file and rename as inference.py and copying code below
# inference.py
%%writefile inference.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

def model_fn(model_dir):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", torch_dtype=torch.float16).to(device)
    model.eval()
    return {"tokenizer": tokenizer, "model": model, "device": device}

def input_fn(request_body, request_content_type):
    assert request_content_type == 'application/json'
    input_data = json.loads(request_body)
    return input_data

def predict_fn(input_data, model_artifacts):
    tokenizer = model_artifacts['tokenizer']
    model = model_artifacts['model']
    device = model_artifacts['device']
    
    input_text = input_data['inputs']
    encoded_input = tokenizer(input_text, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output_ids = model.generate(**encoded_input, max_length=512)
    return output_ids

def output_fn(prediction_output, accept_content_type):
    assert accept_content_type == 'application/json'
    tokenizer = model_artifacts['tokenizer']
    decoded_output = tokenizer.decode(prediction_output[0], skip_special_tokens=True)
    return json.dumps({"generated_text": decoded_output})



UsageError: Line magic function `%%writefile` not found.


In [34]:
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

role = sagemaker.get_execution_role()
huggingface_model = HuggingFaceModel(
    env={"HF_MODEL_ID": "mistralai/Mistral-7B-Instruct-v0.2", "HF_TASK": "text-generation"},
    role=role,
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version="py36",
    entry_point="inference.py"  # Your custom inference script
)

# Deploy the model
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.2xlarge",
    endpoint_name="mistral-7b-demo"
)

------!

In [35]:
endpoint_name = predictor.endpoint_name
endpoint_name

'mistral-7b-demo'

In [44]:
!pwd

/home/ec2-user/SageMaker


In [49]:
!ls -l

total 1084
-rw-rw-r-- 1 ec2-user ec2-user 117820 Jan  3 12:12 Building a simple RAG chatbot (1)-Copy1.ipynb
-rw-rw-r-- 1 ec2-user ec2-user 215593 Jan  9 09:01 clean slate-Copy1.ipynb
-rw-rw-r-- 1 ec2-user ec2-user  85124 Feb 12 11:07 clean slate_FLAN-01feb.ipynb
-rw-rw-r-- 1 ec2-user ec2-user  97953 Feb 24 05:53 clean slate_FLAN.ipynb
-rw-rw-r-- 1 ec2-user ec2-user 152954 Feb 25 15:22 clean slate_other.ipynb
drwxrwxr-x 2 ec2-user ec2-user   4096 Nov 27 07:14 faiss_index
drwxrwxr-x 2 ec2-user ec2-user   4096 Jan 16 10:21 flagged
-rw-rw-r-- 1 ec2-user ec2-user  65244 Jan  9 09:26 Fresh RAG.ipynb
-rw-rw-r-- 1 ec2-user ec2-user   1895 Feb 25 15:11 inference.py
drwx------ 2 root     root      16384 Nov 21 11:05 lost+found
-rw-rw-r-- 1 ec2-user ec2-user 105972 Jan  6 11:21 simple RAG copy2-Copy1.ipynb
-rw-rw-r-- 1 ec2-user ec2-user 120702 Feb  4 07:35 SOC-app-Copy1.ipynb
-rw-rw-r-- 1 ec2-user ec2-user  75416 Feb 12 10:19 SOC-app.ipynb
-rw-rw-r-- 1 ec2-user ec2-user  31939 Jan  7 16:22 stream

**Zero Shot example** 1. Ask a question to LLM without providing the context
To better illustrate why we need retrieval-augmented generation (RAG) based approach to solve the question and anwering problem. Let's directly ask the model a question and see how they respond.

Step 3. Improve the answer to the same question using prompt engineering with insightful context
To better answer the question well, we provide extra contextual information, combine it with a prompt, and send it to model together with the question. Below is an example.

Let's see if our LLM is capable of following our instructions...

## Configure LLM in LangChain

In [36]:
model_kwargs = {"max_new_tokens": 512, "top_p": 0.8, "temperature": 0.8}

In [37]:
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        input_str = json.dumps(
            # Mistral prompt, see https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
            {"inputs": f"<s>[INST] {prompt} [/INST]", "parameters": {**model_kwargs}}
        )
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        response_json = json.loads(output.read().decode("utf-8"))
        splits = response_json[0]["generated_text"].split("[/INST] ")
        return splits[1]

content_handler = ContentHandler()

In [38]:
import boto3
sm_client = boto3.client("sagemaker-runtime") # needed for AWS credentials

llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)



## Zero-shot example

In [39]:
system_prompt = """
As a helpful energy specialist, please answer the question, focusing on numerical data.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.
"""

prompt = PromptTemplate.from_template(system_prompt + "{content}")

In [40]:
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [41]:
question = "What is the latest trend for solar investments in China?"

query = f"question: {question}"

In [42]:
answer = llm_chain.run({query})
print(answer)

ValueError: Error raised by inference endpoint: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "\u0027mistral\u0027"
}
". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/mistral-7b-demo in account 254455524940 for more information.

In [52]:
# Assuming llm_chain and PromptTemplate are properly defined and imported
system_prompt = """
As a helpful energy specialist, please answer the question, focusing on numerical data.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.
"""

# Note the use of {content} as a placeholder in your prompt
prompt_template = PromptTemplate.from_template(system_prompt + "{content}")

# Initialize LLMChain with the prompt template
llm_chain = LLMChain(llm=llm, prompt=prompt_template)

# The question you want to ask
question = "What is the latest trend for solar investments in China?"

# Form the query as a dictionary where the key matches the placeholder in the template
query = {"content": question}

# Run the query through the LLMChain
answer = llm_chain.run(query)

# Print the answer
print(answer)


ValueError: Error raised by inference endpoint: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "\u0027mistral\u0027"
}
". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/mistral-7b-demo in account 254455524940 for more information.

## RAG example with PDF files

In [17]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader

In [18]:
#!pip install pypdf
#!pip install pypdf2

#this is working for single pdf it is created as pypdf can't load s3 object directly
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Specify your S3 bucket name and item name
bucket_name = "bo-automation"
item_name = "langchain-rag-demo/Coal2022.pdf"

# Create an S3 client
s3 = boto3.client("s3")

# Get the PDF file content from S3
response = s3.get_object(Bucket=bucket_name, Key=item_name)
pdf_content = response["Body"].read()

# Use BytesIO to create a file-like object from the PDF content
pdf_file = BytesIO(pdf_content)

# Save the contents to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(pdf_content)
    temp_file_path = temp_file.name

# Create a PyPDFLoader instance and load the PDF document
loader = PyPDFLoader(temp_file_path)
docs = loader.load()
print(len(docs))

# Now you can work with the 'document' object, which represents the PDF content
# For example, you can access the pages: document.pages

# Optionally, delete the temporary file
os.remove(temp_file_path)

In [46]:
#working fine to load multiple files
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Initialize Boto3 S3 client
s3 = boto3.client('s3')
bucket_name = "bo-automation1"
prefix = "langchain-rag-demo/"

# List objects within the specified directory
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Initialize a list to hold all documents
all_docs = []

# Iterate over each file and load its contents
for obj in response.get('Contents', []):
    item_name = obj['Key']
    # Process only PDF files
    if item_name.endswith('.pdf'):
        print(f"Loading file: {item_name}")

        # Get the PDF file content from S3
        pdf_response = s3.get_object(Bucket=bucket_name, Key=item_name)
        pdf_content = pdf_response["Body"].read()

        # Use BytesIO to create a file-like object from the PDF content
        pdf_file = BytesIO(pdf_content)

        # Save the contents to a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Load the PDF document
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()
        all_docs.extend(docs)
        print(len(docs))

        # Optionally, delete the temporary file
        os.remove(temp_file_path)

# Now all_docs contains documents from all PDF files


Loading file: langchain-rag-demo/5G_Dimensioning and Network Design Guidelines.docx.pdf
91
Loading file: langchain-rag-demo/dimensioning_guide.pdf
36


In [57]:
print(len(all_docs))

127


#not using,but working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

chunks = text_splitter.split_documents(docs)

# Print information about the chunks
print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")


In [59]:
#working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)
all_chunks = []

# Assuming 'docs' is the list of loaded documents
for document in all_docs:
    # Extract text from the document
    #text = document.content  # Adjust this based on the actual structure of your Document object

    # Split the text into chunks
    chunks = text_splitter.split_documents(all_docs)

    # Add the chunks to the list
    all_chunks += chunks

    # Print information about the chunks
    print(f"Original text length: {len(all_docs)}, number of chunks: {len(chunks)}")

#chunks = text_splitter.split_documents(docs)

# Print information about the chunks
#print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")
print(f"Number of chunks: {len(all_chunks)}")


Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chunks: 745
Original text length: 36, number of chun

### Analyze documents with Amazon Textract and split them in chunks

### Embed document chunks and store them in FAISS
https://github.com/facebookresearch/faiss 

In [52]:
#from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [22]:
# Define embedding model
# See https://huggingface.co/spaces/mteb/leaderboard

embedding_model_id = "BAAI/bge-small-en-v1.5"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [60]:
%%time
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(all_chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(all_chunks, embeddings)


Number of chunks: 94615
CPU times: user 2min 32s, sys: 5.26 s, total: 2min 38s
Wall time: 2min


#not using
%%time
#alternate
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(chunks, embeddings)

In [61]:
# Save database
embeddings_db.save_local("faiss_index")

### Shortcut : load existing embedding database

In [62]:
embeddings_db = FAISS.load_local("faiss_index", embeddings)

********

### Configure RAG chain

In [63]:
retriever = embeddings_db.as_retriever(search_kwargs={"k": 5})

In [65]:
#working version
# Define prompt template1
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

In [66]:
chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt_template})

### Ask our question again

In [67]:
question = "System Requirements for Cloud Container Distribution?"
answer = chain.run({"query": question})
#print(answer)

#answer = chain.run({"query": question})
print(answer)


[{'generated_text': '2 2 Dimensioning When Using Image-based Deployment 3 3 Dimensioning When '}]
2 2 Dimensioning When Using Image-based Deployment 3 3 Dimensioning When 


In [68]:
question = "How much raw capacity is available in ceph cluster?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': "I don't know"}]
I don't know


In [69]:
question = "what is the result of encryption?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': 'increases the latency and also lowers the maximum load the system can handle. Because of this,'}]
increases the latency and also lowers the maximum load the system can handle. Because of this,


In [75]:
question = "A CCSM instance can be connected to a maximum of how many HSM?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': '6+1'}]
6+1


Alternate approach

Test interface using Gradio

In [33]:
#pip install gradio
#import gradio
import pydantic

#print("Gradio version:", gradio.__version__)
print("Pydantic version:", pydantic.__version__)


Pydantic version: 1.10.13


In [34]:
#pip install --upgrade pydantic gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pydantic
  Downloading pydantic-2.5.3-py3-none-any.whl.metadata (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.6/65.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-4.14.0-py3-none-any.whl.metadata (15 kB)
Collecting annotated-types>=0.4.0 (from pydantic)
  Downloading annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)
Collecting pydantic-core==2.14.6 (from pydantic)
  Downloading pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio-client==0.8.0 (from gradio)
  Downloading gradio_client-0.8.0-py3-non

In [35]:
#pip install --upgrade pydantic gradio

In [36]:
#pip install gradio==3.48.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gradio==3.48.0
  Downloading gradio-3.48.0-py3-none-any.whl.metadata (17 kB)
Collecting gradio-client==0.6.1 (from gradio==3.48.0)
  Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-3.48.0-py3-none-any.whl (20.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.2/299.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installation: gradio_client 0.8.0
    Uninstalling gradio_client-0.8.0:
      Successfully uninstalled gradio_client-0.8.0
  Attempting uninstall: gradio
    Found existing installation: gradio 4.14.0
    Uninstalling gradio-4.14.0:
      Successfully uninstalled gradio-4.14.0
Successful

In [37]:
#pip install --upgrade starlette

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [38]:
import gradio as gr

Matplotlib is building the font cache; this may take a moment.


In [76]:
# Assuming the initialization of your model, prompt template, and chain is done here
# Define prompt template
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt_template}
)

# Define gradio model function
def model_function(question):
    # Make prediction using the chain
    answer = chain.run({"query": question})
    return answer



In [77]:
# Define the gradio interface for our use case
interface = gr.Interface(fn=model_function, inputs="text", outputs="text")
interface.launch()
#interface.launch(share=True)



Running on local URL:  http://127.0.0.1:7865
Sagemaker notebooks may require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://f01377da92ab18bc7e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




[{'generated_text': '10%'}]


## Delete endpoint and model

predictor.delete_model()
predictor.delete_endpoint()

import boto3

sagemaker = boto3.client('sagemaker')
response = sagemaker.list_endpoints()

if not response['Endpoints']:
    print("No active endpoints.")
