In [None]:
!pip install PyMuPDF

In [None]:
import fitz

In [None]:
# Function to extract text using PyMuPDF
def extract_text_from_pdf_mupdf(pdf_path):
    """
    Extract the plain text of every page of a PDF using PyMuPDF.

    Parameters:
    - pdf_path (str): Path of the PDF file to open.

    Returns:
    - str: The text of all pages, concatenated in page order.
    """
    # The context manager closes the document (and its file handle) even when
    # extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(
            document.load_page(page_num).get_text()
            for page_num in range(len(document))
        )

In [None]:
# Extract the full text of the agency-agreement PDF from the lab data directory using PyMuPDF
pdf_text_mupdf = extract_text_from_pdf_mupdf("../lab-data/BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF")
pdf_text_mupdf[:2000]  # Preview only the first 2000 characters; as the cell's last expression this is what gets displayed

In [None]:
def chunk_text(text, chunk_size, overlap):
    """
    Chunk text into smaller segments with a specified chunk size and overlap.

    Consecutive chunks start `chunk_size - overlap` characters apart, so each
    chunk repeats the last `overlap` characters of the previous one. The final
    chunk may be shorter than `chunk_size`. Chunking stops as soon as a chunk
    reaches the end of the text, so no chunk is ever a pure repeat of the tail
    of its predecessor.

    Parameters:
    - text (str): The text to be chunked.
    - chunk_size (int): The size of each chunk.
    - overlap (int): The number of characters that overlap between chunks.

    Returns:
    - List[str]: A list of text chunks (empty when `text` is empty).

    Raises:
    - ValueError: If `overlap` is negative or `chunk_size` is not greater
      than `overlap`.
    """
    # A negative overlap would make the stride larger than the chunk size and
    # silently drop the characters in between.
    if overlap < 0:
        raise ValueError("Overlap must be non-negative")
    if chunk_size <= overlap:
        raise ValueError("Chunk size must be greater than overlap")

    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), stride):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would be
        # fully contained in this one, so stop here.
        if end >= len(text):
            break

    return chunks

In [None]:
# Split the extracted text into chunks of 1024 characters with a 256-character overlap
chunks = chunk_text(pdf_text_mupdf, 1024, 256)
print(len(chunks))  # number of chunks produced
chunks[:2]  # display the first two chunks as a sanity check

In [None]:
import boto3
import json

In [None]:
# Initialize the Bedrock runtime client (region us-west-2); embed_text below invokes models through it
client = boto3.client('bedrock-runtime', region_name='us-west-2')

In [None]:
# Function to embed text
def embed_text(input_text, dimensions=512, normalize=True,
               model_id='amazon.titan-embed-text-v2:0'):
    """
    Embed a piece of text with a Bedrock embedding model.

    The request is sent through the module-level Bedrock runtime `client`.

    Parameters:
    - input_text (str): The text to embed.
    - dimensions (int): Requested size of the embedding vector. The default
      of 512 is the same size as the "Embedding" column of the NaiveRAG table.
    - normalize (bool): Whether to ask the model to normalize the embedding.
    - model_id (str): Identifier of the Bedrock embedding model to invoke.

    Returns:
    - The value of the "embedding" field of the model response, or None if
      the response carries no such field.
    """
    # Create the request payload
    payload = {
        "inputText": input_text,
        "dimensions": dimensions,
        "normalize": normalize,
    }

    # Invoke the model
    response = client.invoke_model(
        body=json.dumps(payload),
        modelId=model_id,
        accept='application/json',
        contentType='application/json',
    )

    # The body is a stream; read it once and parse the JSON document
    response_body = json.loads(response['body'].read())
    return response_body.get('embedding')

# Smoke test: embed the first chunk and print the resulting embedding
print(embed_text(chunks[0]))

In [None]:
!pip install pyepsilla

In [None]:
# Run the lab's setup script from the parent directory.
# NOTE(review): presumably this starts the local vector database server that the
# client below connects to on localhost:8888 — confirm against setup.sh.
!sh ../setup.sh

In [None]:
from pyepsilla import vectordb
## Connect to the vector database server listening on localhost:8888
db = vectordb.Client(
  host='localhost',
  port='8888'
)

In [None]:
# Unload the database, then load it from /tmp/kdd_lab1_rag.
# NOTE(review): the unload presumably exists so this cell can be re-run while the
# database is already loaded — confirm how unload_db behaves on a fresh server.
db.unload_db("kdd_lab1_rag")
db.load_db(db_name="kdd_lab1_rag", db_path="/tmp/kdd_lab1_rag")

In [None]:
# Select the database and create the table that stores each chunk together with its embedding.
db.use_db(db_name="kdd_lab1_rag")
db.create_table(
  table_name="NaiveRAG",
  table_fields=[
    {"name": "ID", "dataType": "INT", "primaryKey": True},  # integer primary key
    {"name": "Doc", "dataType": "STRING"},  # the chunk text
    # Same size as the "dimensions": 512 requested from the embedding model in embed_text
    {"name": "Embedding", "dataType": "VECTOR_FLOAT", "dimensions": 512}
  ]
)

In [None]:
# Build one record per chunk: its position in `chunks` as ID, its text, and its embedding.
# embed_text is called once per chunk, so this cell issues one model invocation per chunk.
records = [
    {
        "ID": index,
        "Doc": text,
        "Embedding": embed_text(text)
    }
    for index, text in enumerate(chunks)
]
records[:2]  # display the first two records as a sanity check

In [None]:
# Insert all chunk records into the NaiveRAG table
db.insert("NaiveRAG", records)

In [None]:
def generate(prompt, temperature=0, max_gen_len=128,
             model_id='meta.llama3-8b-instruct-v1:0'):
    """
    Generate a completion for a prompt with a Bedrock-hosted text model.

    The request is sent through the module-level Bedrock runtime `client`
    (the same one embed_text uses) instead of building a new client on every
    call.

    Parameters:
    - prompt (str): The full prompt, already formatted for the target model.
    - temperature (float): Randomness of the output.
    - max_gen_len (int): Value sent as "max_gen_len" in the request payload.
    - model_id (str): Identifier of the Bedrock model to invoke.

    Returns:
    - The value of the "generation" field of the model response, or None if
      the response carries no such field.
    """
    # Create the request payload
    payload = {
        "prompt": prompt,
        "temperature": temperature,
        "max_gen_len": max_gen_len,
    }

    # Invoke the model
    response = client.invoke_model(
        modelId=model_id,
        contentType='application/json',
        accept='application/json',
        body=json.dumps(payload),
    )

    # json.loads accepts the raw UTF-8 bytes of the body directly
    response_body = json.loads(response['body'].read())
    return response_body.get('generation')

# Example usage
input_text = "How are you?"
# The model invoked by generate() is Llama 3 Instruct, so the prompt uses the
# Llama 3 chat template (header tokens and <|eot_id|>) rather than the Llama 2
# [INST] / <<SYS>> markers. It ends with an open assistant header so the model
# answers as the assistant.
prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<|eot_id|><|start_header_id|>user<|end_header_id|>

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

print(generate(prompt))

In [None]:
def basic_retriever(question, top_k):
    """
    Retrieve the stored chunks that best match a question.

    The question is embedded with embed_text and used as the query vector
    against the "Embedding" field of the NaiveRAG table.

    Parameters:
    - question (str): The natural-language question to search for.
    - top_k (int): Maximum number of records to return.

    Returns:
    - The "result" entry of the query response (the matching records).

    Raises:
    - RuntimeError: If the query response carries no "result" entry; the
      message includes the returned status code and response.
    """
    code, resp = db.query(
        table_name="NaiveRAG",
        query_field="Embedding",
        query_vector=embed_text(question),
        limit=top_k
    )
    # Surface a failed query with its status and payload instead of letting it
    # show up as an unexplained KeyError on "result".
    if not isinstance(resp, dict) or "result" not in resp:
        raise RuntimeError(
            f"Query on table 'NaiveRAG' failed (status {code}): {resp}"
        )
    return resp["result"]
# Try the retriever on a sample question, asking for up to 5 records
basic_retriever("What's the agreement date?", 5)

In [None]:
def naive_rag(question, top_k=5):
    """
    Answer a question with a naive retrieve-then-generate pipeline.

    The chunks returned by basic_retriever are placed in a <documents> block
    and sent to generate() together with the question. The prompt follows the
    Llama 3 chat template: the instructions go in the system turn, the
    documents and question in the user turn, and the prompt ends with an open
    assistant header so the model replies as the assistant.

    Parameters:
    - question (str): The question to answer.
    - top_k (int): Number of chunks to retrieve as context (default 5).

    Returns:
    - The text returned by generate() for the assembled prompt.
    """
    docs = basic_retriever(question, top_k)
    separator = "------------------------\n"
    # Leading separator, then every document followed by a separator.
    docs_str = separator + "".join(doc["Doc"] + separator for doc in docs)
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.

Your answer should be grounded by the information provided in the documents below.
Don't make up answers.
Don't explain your thought process.
Directly answer the question in concise way.<|eot_id|><|start_header_id|>user<|end_header_id|>

<documents>
{docs_str}
</documents>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return generate(prompt)

# Run the full retrieve-then-generate pipeline on a sample question
naive_rag("What's the agreement date?")