 # Module 5

## RAG the hard way
Prerequisites

In [1]:
# Required imports
import boto3
import json
import PyPDF2
from pprint import pp
from opensearchpy import (
    AWSV4SignerAuth,
    OpenSearch,
    RequestsHttpConnection,
)
# AWS Region used for the Bedrock runtime client and for SigV4-signing
# the requests sent to OpenSearch Serverless
region = "us-east-1"

### Extract text from PDF file

In [2]:
# Read the 10-K report and concatenate the extracted text of every page,
# in page order, into a single string
pdf_path = "input/AnyCompany_financial_10K.pdf"
with open(pdf_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    text = "".join(page.extract_text() for page in reader.pages)

### Split text into overlapping chunks

In [3]:
# Sliding-window chunking: every chunk holds up to `chunk_size` characters and
# shares its first `overlap` characters with the end of the previous chunk
chunk_size = 1000
overlap = 200

# Distance between the start positions of two consecutive chunks
step = chunk_size - overlap
if step <= 0:
    # A non-positive step would never advance through the text
    raise ValueError("overlap must be smaller than chunk_size")

# Slicing past the end of the string is safe: the last chunk is simply shorter
chunks = [text[start:start + chunk_size] for start in range(0, len(text), step)]

print(f"The total number of chunks is {len(chunks)}")

The total number of chunks is 287


### Function to get embedding of text using Bedrock Titan model

In [4]:
# Bedrock runtime client, used for the embedding calls below and for the
# Converse call in the generation step
bedrock = boto3.client('bedrock-runtime', region_name=region)

def get_embedding(text):
    """Return the embedding of ``text`` computed by the Titan embedding model.

    Sends ``text`` as ``inputText`` to ``amazon.titan-embed-text-v2:0``
    through ``invoke_model`` and returns the ``embedding`` entry of the
    JSON-decoded response body.
    """
    payload = json.dumps({"inputText": text})

    result = bedrock.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        body=payload,
    )

    # The response body is a stream; read it fully before decoding the JSON
    parsed = json.loads(result['body'].read())
    return parsed['embedding']

Test the function with the first chunk

In [5]:
# Smoke test: embed the first chunk; the embedding is kept because the
# indexing test further down reuses it
first_chunk = chunks[0]
print(f"The first chunk: {first_chunk}")
embedding_of_first_chunk = get_embedding(first_chunk)
print(f"Size of the generated embedding: {len(embedding_of_first_chunk)}")
print("Embedding of first chunk:", embedding_of_first_chunk, sep="\n")

The first chunk: EXHIBITS, FINANCIAL STATEMENT SCHEDULES including Financial Statement Schedules, Exhibits, Signatures, Power of Attorney  AnyCompany Financial's 10K report includes several financial statement schedules and exhibits that provide important information to investors and other stakeholders. These schedules and exhibits are essential components of the report, offering detailed data and insights that supplement the financial statements.  **Financial Statement Schedules**  Financial statement schedules offer additional information about specific line items in the financial statements. At AnyCompany, we provide the following financial statement schedules:  1. **Schedule of Assets:** This schedule provides a detailed breakdown of our assets, including a list of our major customers, the carrying amounts and fair values of our financial assets, and the gross and net amounts of our impaired assets.  2. **Schedule of Liabilities:** This schedule offers a comprehensive view of our l

### Index chunks with embeddings

#### Initialize OpenSearch Serverless client

In [6]:
# Endpoint of the OpenSearch Serverless collection and name of the index
# that stores the chunks
collection_endpoint = "https://z652gy3ntfv21x4xxgn7.us-east-1.aoss.amazonaws.com"
index_name = "financial-documents"

# SigV4 signer for the Amazon OpenSearch Serverless service ('aoss'), built
# from the credentials of the default boto3 session
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, region, 'aoss')

# Strip the URL scheme so that only the host name is passed to the client
collection_host = collection_endpoint.replace('https://', '')

# OpenSearch client used to index and search documents directly in the index
opensearch_client = OpenSearch(
    hosts=[{'host': collection_host, 'port': 443}],
    connection_class=RequestsHttpConnection,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    pool_maxsize=20,
)

#### Test the indexing

In [7]:
# Bundle the first chunk and its embedding into the document sent to OpenSearch
doc = dict(embedding=embedding_of_first_chunk, text_chunk=chunks[0])

# Index the document; being the last expression of the cell, the response
# returned by OpenSearch is displayed as the cell output
opensearch_client.index(index=index_name, body=doc)

{'_index': 'financial-documents',
 '_id': '1%3A0%3AX0BlnJkBVVLWMWXhbnZN',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 0, 'successful': 0, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 0}

#### Embed and index each chunk

In [8]:
# chunks[0] was already written to the index by the single-document test
# above. Documents are indexed without an explicit ID, so indexing it again
# here would store a second copy of the same chunk; start from the second
# chunk instead. (If the test cell is skipped, iterate over `chunks`.)
for chunk in chunks[1:]:
    # Document sent to OpenSearch: the chunk text plus its embedding
    doc = {
        "embedding": get_embedding(chunk),
        "text_chunk": chunk,
    }

    # Add the document to the index
    opensearch_client.index(index=index_name, body=doc)

### Retrieval (search)

#### Search for a single document

In [9]:
# Text to look up in the index
query = "liquid cash"

# Embed the query with the same function used for the chunks
# (the result is a vector of 1024 dimensions)
query_embedding = get_embedding(query)

# Number of documents to retrieve
k = 1

# k-NN clause on the "embedding" field, wrapped in a search body whose
# "size" limits the number of hits returned
knn_clause = {"embedding": {"vector": query_embedding, "k": k}}
search_body = {"size": k, "query": {"knn": knn_clause}}

# Run the search against the index
response = opensearch_client.search(index=index_name, body=search_body)

# Show the raw response
print(json.dumps(response, indent=2))

{
  "took": 24,
  "timed_out": false,
  "_shards": {
    "total": 0,
    "successful": 0,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 10,
      "relation": "eq"
    },
    "max_score": 0.45532855,
    "hits": [
      {
        "_index": "financial-documents",
        "_id": "1%3A0%3AZUBpnJkBVVLWMWXhLndK",
        "_score": 0.45532855,
        "_source": {
          "embedding": [
            -0.03943469002842903,
            0.009929223917424679,
            -0.039845798164606094,
            0.002280543791130185,
            0.019847044721245766,
            0.0031611123122274876,
            -0.025029249489307404,
            0.006321708671748638,
            -0.01033308170735836,
            0.037403080612421036,
            0.030817311257123947,
            0.0020179790444672108,
            -0.022463509812951088,
            -0.0198532547801733,
            0.03999275714159012,
            0.002246496034786105,
            -0.02621987834572792,

#### Define the function to search for similar documents

In [10]:
def search(query, k=3):
    """Return the text of the ``k`` indexed chunks most similar to ``query``.

    The query is embedded with ``get_embedding`` and used in a k-NN search
    on the ``embedding`` field of the ``index_name`` index.

    Args:
        query: Text to search for.
        k: Number of chunks to retrieve (default 3).

    Returns:
        A list with the ``text_chunk`` value of each hit, in the order
        returned by OpenSearch.
    """
    # Get the embedding of the query
    query_embedding = get_embedding(query)

    # Create the search query
    search_body = {
        "size": k,
        # Only the chunk text is read below, so ask OpenSearch not to send
        # back the stored embedding vector of every hit
        "_source": ["text_chunk"],
        "query": {
            "knn": {
                "embedding": {
                    "vector": query_embedding,
                    "k": k
                }
            }
        }
    }

    # Search for the embedding in OpenSearch
    response = opensearch_client.search(index=index_name, body=search_body)

    # Extract the text_chunk of every hit
    return [hit['_source']['text_chunk'] for hit in response['hits']['hits']]

### Augmentation (augment the prompt)

In [11]:
# Question to answer with the help of the indexed document
original_prompt = "What investments have AnyCompany made?"

# Retrieve the chunks most similar to the question (search() returns 3 by default)
relevant_docs = search(query=original_prompt)

In [12]:
# One "document" content block per retrieved chunk, numbered from 1
document_blocks = [
    {
        "document": {
            "name": f"financial_doc_{position}",
            "format": "txt",
            # The chunk text has to be converted to bytes
            "source": {"bytes": chunk_text.encode('utf-8')},
        }
    }
    for position, chunk_text in enumerate(relevant_docs, start=1)
]

# The question is placed after the documents
question_block = {
    "text": f"Based on the provided documents, please answer: {original_prompt}"
}

# Single user message for the Converse API
messages = [{"role": "user", "content": document_blocks + [question_block]}]

# pprint is used for display because the content contains bytes
pp(messages, indent=2, width=120)

[ { 'role': 'user',
    'content': [ { 'document': { 'name': 'financial_doc_1',
                                 'format': 'txt',
                                 'source': { 'bytes': b'ading securities portfolio was $900 million as of December 3'
                                                      b'1, 2021, representing 17% of the total investment portfolio.'
                                                      b' The trading securities portfolio had an unrealized loss of '
                                                      b"$50 million as of December 31, 2021.  AnyCompany Financial's"
                                                      b' investment portfolio includes significant investment positi'
                                                      b'ons in the following companies:  * Company A: $400 million *'
                                                      b' Company B: $350 million * Company C: $250 million  These si'
                                            

### Generation

In [13]:
# Send the augmented message to the model through the Converse API
generation_model_id = 'us.amazon.nova-lite-v1:0'
response = bedrock.converse(modelId=generation_model_id, messages=messages)

# Show the full response, metadata included
print(json.dumps(response, indent=2))

{
  "ResponseMetadata": {
    "RequestId": "67b2b4a8-d852-40fe-a8d9-029066c32e05",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Tue, 30 Sep 2025 21:02:44 GMT",
      "content-type": "application/json",
      "content-length": "886",
      "connection": "keep-alive",
      "x-amzn-requestid": "67b2b4a8-d852-40fe-a8d9-029066c32e05"
    },
    "RetryAttempts": 0
  },
  "output": {
    "message": {
      "role": "assistant",
      "content": [
        {
          "text": "Based on the provided documents, AnyCompany Financial has made significant investments in the following companies:\n\n1. Company A: $400 million\n2. Company B: $350 million\n3. Company C: $250 million\n\nThese significant investment positions represent 22% of the total investment portfolio. The company's investment portfolio is diversified across various sectors, including technology, healthcare, financial services, and consumer goods. The investment policy requires that no single investment position exc