# Building a simple RAG chatbot with LangChain, Hugging Face, FAISS, Amazon SageMaker and Amazon Textract

In [1]:
%%sh
pip install sagemaker langchain amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf pip install faiss-cpu -qU

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.1.1 which is incompatible.
sphinx 7.2.6 requires docutils<0.21,>=0.18.1, but you have docutils 0.16 which is incompatible.[0m[31m
[0m

In [2]:
import boto3, json, sagemaker
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Deploy LLM on SageMaker

In [3]:
#t5 XL
# Hub Model configuration. https://huggingface.co/models
role = sagemaker.get_execution_role()

hub = {
	#'HF_MODEL_ID':'google/flan-t5-small',
    'HF_MODEL_ID':'google/flan-t5-xl',
    #'HF_MODEL_ID':'google/flan-t5-xxl',
	'SM_NUM_GPUS': json.dumps(1)
}



# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="1.1.0"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g4dn.2xlarge",
	container_startup_health_check_timeout=300,
    endpoint_name="flan-t5-demo"
  )
  
# send request
predictor.predict({
	"inputs": "Translate to German:  My name is Arthur",
})

---------!

[{'generated_text': 'Ich bin Arthur.'}]

In [4]:
endpoint_name = predictor.endpoint_name
endpoint_name

'flan-t5-demo'

**Zero Shot example** 1. Ask a question to LLM without providing the context
To better illustrate why we need retrieval-augmented generation (RAG) based approach to solve the question and anwering problem. Let's directly ask the model a question and see how they respond.

In [5]:
question = "Which instances can I use with Managed Spot Training in SageMaker?"

out = predictor.predict({"inputs": question})
out

[{'generated_text': 'SageMaker and SageMaker XL.'}]

# define payload
prompt="""<|prompter|>How can i stay more active during winter? Give me 3 tips.<|endoftext|><|assistant|>"""

# hyperparameters for llm
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.7,
    "temperature": 0.7,
    "top_k": 50,
    "min_length": 200,
    "stop": ["<|endoftext|>"]
  }
}

# send request to endpoint
response = predictor.predict(payload)

# print(response[0]["generated_text"][:-len("<human>:")])
print(response[0]["generated_text"])


Step 3. Improve the answer to the same question using prompt engineering with insightful context
To better answer the question well, we provide extra contextual information, combine it with a prompt, and send it to model together with the question. Below is an example.

In [10]:
context = """Managed Spot Training can be used with all instances
supported in Amazon SageMaker. Managed Spot Training is supported
in all AWS Regions where Amazon SageMaker is currently available."""

In [11]:
prompt_template = """Answer the following QUESTION based on the CONTEXT
given. #If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

text_input = prompt_template.replace("{context}", context).replace("{question}", question)

out = predictor.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {question}\n[Output]: {generated_text}")

[Input]: Which instances can I use with Managed Spot Training in SageMaker?
[Output]: all instances supported in Amazon SageMaker


Let's see if our LLM is capable of following our instructions...

In [12]:
unanswerable_question = "What color is my desk?"

text_input = prompt_template.replace("{context}", context).replace("{question}", unanswerable_question)

out = predictor.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {unanswerable_question}\n[Output]: {generated_text}")

[Input]: What color is my desk?
[Output]: I don't know


## Configure LLM in LangChain

In [13]:
model_kwargs = {"max_new_tokens": 1024,
    "top_p": 0.8, "temperature": 0.8,"max_length": 512,"min_length":300 }

In [14]:
#define content handler class
class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        input_data = {
            "inputs": prompt,  # Adjust this field based on the expected input format
            **model_kwargs,
        }
        input_str = json.dumps(input_data)
        return input_str.encode("utf-8")
     
    def transform_output(self, output: 'StreamingBody') -> str:
            response_json = json.loads(output.read().decode("utf-8"))
            print(response_json)
            return response_json[0]["generated_text"]


content_handler = ContentHandler()


In [15]:
import boto3
sm_client = boto3.client("sagemaker-runtime") # needed for AWS credentials

llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)



## Zero-shot example

In [16]:
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt = PromptTemplate.from_template(system_prompt + "{context}")

In [17]:
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [21]:
context ="Solar investments trends in China have increased by 30% each year in the last decade"
question = "What is the investment trend for solar investments in China?"

query = f"question: {question}"

In [22]:
query = f"question: {question}"
print(query)

question: What is the investment trend for solar investments in China?


In [23]:
answer = llm_chain.run({"context": context, "question": query})
print(answer)

[{'generated_text': '30%'}]
30%


## RAG example with PDF files

In [24]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader

In [25]:
#!pip install pypdf
#!pip install pypdf2

#this is working for single pdf it is created as pypdf can't load s3 object directly
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Specify your S3 bucket name and item name
bucket_name = "bo-automation"
item_name = "langchain-rag-demo/Coal2022.pdf"

# Create an S3 client
s3 = boto3.client("s3")

# Get the PDF file content from S3
response = s3.get_object(Bucket=bucket_name, Key=item_name)
pdf_content = response["Body"].read()

# Use BytesIO to create a file-like object from the PDF content
pdf_file = BytesIO(pdf_content)

# Save the contents to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(pdf_content)
    temp_file_path = temp_file.name

# Create a PyPDFLoader instance and load the PDF document
loader = PyPDFLoader(temp_file_path)
docs = loader.load()
print(len(docs))

# Now you can work with the 'document' object, which represents the PDF content
# For example, you can access the pages: document.pages

# Optionally, delete the temporary file
os.remove(temp_file_path)

In [26]:
#working fine to load multiple files
import boto3
import tempfile
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import os

# Initialize Boto3 S3 client
s3 = boto3.client('s3')
bucket_name = "bo-automation1"
prefix = "langchain-rag-demo/"

# List objects within the specified directory
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Initialize a list to hold all documents
all_docs = []

# Iterate over each file and load its contents
for obj in response.get('Contents', []):
    item_name = obj['Key']
    # Process only PDF files
    if item_name.endswith('.pdf'):
        print(f"Loading file: {item_name}")

        # Get the PDF file content from S3
        pdf_response = s3.get_object(Bucket=bucket_name, Key=item_name)
        pdf_content = pdf_response["Body"].read()

        # Use BytesIO to create a file-like object from the PDF content
        pdf_file = BytesIO(pdf_content)

        # Save the contents to a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Load the PDF document
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()
        all_docs.extend(docs)
        print(len(docs))

        # Optionally, delete the temporary file
        os.remove(temp_file_path)

# Now all_docs contains documents from all PDF files


Loading file: langchain-rag-demo/5G_Dimensioning and Network Design Guidelines.docx.pdf
91
Loading file: langchain-rag-demo/dimensioning_guide.pdf
36


In [27]:
print(len(all_docs))

127


#not using,but working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

chunks = text_splitter.split_documents(docs)

# Print information about the chunks
print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")


In [28]:
#working fine
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)
all_chunks = []

# Assuming 'docs' is the list of loaded documents
for document in all_docs:
    # Extract text from the document
    #text = document.content  # Adjust this based on the actual structure of your Document object

    # Split the text into chunks
    chunks = text_splitter.split_documents(all_docs)

    # Add the chunks to the list
    all_chunks += chunks

    # Print information about the chunks
    print(f"Original text length: {len(all_docs)}, number of chunks: {len(chunks)}")

#chunks = text_splitter.split_documents(docs)

# Print information about the chunks
#print(f"Original text length: {len(docs)}, number of chunks: {len(chunks)}")
print(f"Number of chunks: {len(all_chunks)}")


Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length: 127, number of chunks: 745
Original text length

### Analyze documents with Amazon Textract and split them in chunks

### Embed document chunks and store them in FAISS
https://github.com/facebookresearch/faiss 

In [29]:
#from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [30]:
# Define embedding model
# See https://huggingface.co/spaces/mteb/leaderboard

embedding_model_id = "BAAI/bge-small-en-v1.5"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [31]:
%%time
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(all_chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(all_chunks, embeddings)


Number of chunks: 94615
CPU times: user 2min 23s, sys: 2.66 s, total: 2min 25s
Wall time: 2min 5s


#not using
%%time
#alternate
# Embed chunks
#embeddings_db = FAISS.from_documents(all_chunks, embeddings)
print(f"Number of chunks: {len(chunks)}")
#print(f"Number of embeddings: {len(embeddings)}")

# Embed chunks
embeddings_db = FAISS.from_documents(chunks, embeddings)

In [34]:
# Save database
embeddings_db.save_local("faiss_index")

### Shortcut : load existing embedding database

In [35]:
embeddings_db = FAISS.load_local("faiss_index", embeddings)

********

### Configure RAG chain

In [36]:
retriever = embeddings_db.as_retriever(search_kwargs={"k": 5})

In [37]:
#working version
# Define prompt template1
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

In [38]:
chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt_template})

### Ask our question again

In [39]:
question = "System Requirements for Cloud Container Distribution?"
answer = chain.run({"query": question})
#print(answer)

#answer = chain.run({"query": question})
print(answer)


[{'generated_text': '2 2 Dimensioning When Using Image-based Deployment 3 3 Dimensioning When '}]
2 2 Dimensioning When Using Image-based Deployment 3 3 Dimensioning When 


In [40]:
question = "How much raw capacity is available in ceph cluster?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': "I don't know"}]
I don't know


In [41]:
question = "what is the result of encryption?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': 'increases the latency and also lowers the maximum load the system can handle. Because of this,'}]
increases the latency and also lowers the maximum load the system can handle. Because of this,


In [42]:
question = "A CCSM instance can be connected to a maximum of how many HSM?"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': '6+1'}]
6+1


In [43]:
question = "what are System Requirements for Cloud Container Distributionl"
answer = chain.run({"query": question})
print(answer)

[{'generated_text': 'The generic requirements to enable installation of Cloud Container Distribution are described in Infrastructure Requirements'}]
The generic requirements to enable installation of Cloud Container Distribution are described in Infrastructure Requirements


Alternate approach

Test interface using Gradio

In [46]:
#pip install gradio
#import gradio
import pydantic

#print("Gradio version:", gradio.__version__)
print("Pydantic version:", pydantic.__version__)


Pydantic version: 1.10.14


In [47]:
pip install --upgrade pydantic gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pydantic
  Downloading pydantic-2.6.0-py3-none-any.whl.metadata (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-4.16.0-py3-none-any.whl.metadata (15 kB)
Collecting annotated-types>=0.4.0 (from pydantic)
  Downloading annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)
Collecting pydantic-core==2.16.1 (from pydantic)
  Downloading pydantic_core-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio-client==0.8.1 (from gradio)
  Downloading gradio_client-0.8.1-py3-non

In [48]:
pip install --upgrade pydantic gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [49]:
pip install gradio==3.48.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gradio==3.48.0
  Downloading gradio-3.48.0-py3-none-any.whl.metadata (17 kB)
Collecting gradio-client==0.6.1 (from gradio==3.48.0)
  Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-3.48.0-py3-none-any.whl (20.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.2/299.2 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gradio-client, gradio
  Attempting uninstall: gradio-client
    Found existing installation: gradio_client 0.8.1
    Uninstalling gradio_client-0.8.1:
      Successfully uninstalled gradio_client-0.8.1
  Attempting uninstall: gradio
    Found existing installation: gradio 4.16.0
    Uninstalling gradio-4.16.0:
      Successfully uninstalled gradio-4.16.0
Successfully instal

In [50]:
pip install --upgrade starlette

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting starlette
  Downloading starlette-0.36.1-py3-none-any.whl.metadata (5.8 kB)
Downloading starlette-0.36.1-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.4/71.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: starlette
  Attempting uninstall: starlette
    Found existing installation: starlette 0.35.1
    Uninstalling starlette-0.35.1:
      Successfully uninstalled starlette-0.35.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastapi 0.109.0 requires starlette<0.36.0,>=0.35.0, but you have starlette 0.36.1 which is incompatible.[0m[31m
[0mSuccessfully installed starlette-0.36.1
Note: you may need to restart the kernel to use updated packages.


In [53]:
import gradio as gr

In [56]:
pip install PyMuPDF

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting PyMuPDF
  Downloading PyMuPDF-1.23.20-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading PyMuPDF-1.23.20-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hDownloading PyMuPDFb-1.23.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.20 PyMuPDFb-1.23.9
Note: you may need to restart the kernel to use updated packages.


In [52]:
# Assuming the initialization of your model, prompt template, and chain is done here
# Define prompt template
system_prompt = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt_template = PromptTemplate.from_template(system_prompt + "{context}")

chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt_template}
)

# Define gradio model function
def model_function(question):
    # Make prediction using the chain
    answer = chain.run({"query": question})
    return answer



In [42]:
# Define the gradio interface for our use case
interface = gr.Interface(fn=model_function, inputs="text", outputs="text")
interface.launch()
#interface.launch(share=True)



Running on local URL:  http://127.0.0.1:7860
Sagemaker notebooks may require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://4c39b1da777ebb244f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




[{'generated_text': 'The generic requirements to enable installation of Cloud Container Distribution are described in Infrastructure Requirements'}]


Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/langchain_community/llms/sagemaker_endpoint.py", line 355, in _call
    response = self.client.invoke_endpoint(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/client.py", line 553, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/botocore/client.py", line 1009, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.errorfactory.ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` must have less than 1024 tokens. Given: 1176","error_type":"validation"}". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/flan-t5-demo in a

[{'generated_text': "I don't know"}]
[{'generated_text': "I don't know"}]
[{'generated_text': 'a Telco-grade database, designed to'}]
[{'generated_text': 'a Telco-grade database, designed to'}]


In [57]:
# test gradio 2
import gradio as gr
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize your text splitter with the desired chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)

def extract_text_from_pdf(pdf_file):
    # Function to extract text from PDF
    text = ""
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page in doc:
            text += page.get_text()
    return text

def process_text_into_chunks(text):
    # Function to split text into chunks
    chunks = text_splitter.split_documents([text])
    return chunks

def generate_embeddings(chunks):
    # Assuming you have a function to generate embeddings for text chunks
    embeddings = [embed_text(chunk) for chunk in chunks]  # Pseudocode
    return embeddings

def model_function(pdf_file, question):
    # Process the PDF and extract text
    context = extract_text_from_pdf(pdf_file)
    
    # Split the context into chunks
    chunks = process_text_into_chunks(context)
    
    # Generate embeddings for each chunk
    embeddings = generate_embeddings(chunks)
    
    # Use the RAG model to answer the question based on the embeddings
    # This part of the code would depend on how your RAG model uses embeddings to generate an answer
    answer = rag_answer_question(question, embeddings)  # Pseudocode
    return answer

# Gradio Interface
interface = gr.Interface(
    fn=model_function, 
    inputs=[gr.File(type="file", label="Upload PDF"), gr.Textbox(label="Your Question")], 
    outputs="text"
)

interface.launch()



Running on local URL:  http://127.0.0.1:7860
Sagemaker notebooks may require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://dd4d5192428143f38b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/gradio/routes.py", line 534, in predict
    output = await route_utils.call_process_api(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/gradio/route_utils.py", line 226, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/gradio/blocks.py", line 1550, in process_api
    result = await self.call_function(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/gradio/blocks.py", line 1185, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 21

## Delete endpoint and model

predictor.delete_model()
predictor.delete_endpoint()

import boto3

sagemaker = boto3.client('sagemaker')
response = sagemaker.list_endpoints()

if not response['Endpoints']:
    print("No active endpoints.")
