## Understanding and Coding the Self-Attention Mechanism of Large Language Models From Scratch

> https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html

In [20]:
import torch
import torch.nn.functional as F

# Embedding an Input Sentence
sentence = 'Life is short, eat dessert first'

# Tokenise once (dropping the comma) so the vocabulary and the integer
# encoding below are guaranteed to be built from the same word list.
words = sentence.replace(',', '').split()

# Vocabulary: each unique word mapped to its alphabetical rank. De-duplicating
# keeps the indices contiguous (0..len(dc)-1) even if a word repeats.
dc = {s: i for i, s in enumerate(sorted(set(words)))}
print(dc)

sentence_int = torch.tensor([dc[s] for s in words])
print(sentence_int)

torch.manual_seed(123)
# Size the embedding table from the vocabulary instead of hard-coding 6, so
# a different sentence cannot index past the end of the table.
embed = torch.nn.Embedding(len(dc), 16)
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence)
print(embedded_sentence.shape)

print(embedded_sentence[1])
print(embedded_sentence[1].shape)


# Defining the Weight Matrices
torch.manual_seed(123)

# Embedding width, read back from the embedded sentence.
d = embedded_sentence.shape[1]

# Query and key sizes must match because they are combined with a dot
# product; the value size is free to differ.
d_q, d_k, d_v = 24, 24, 28

W_query = torch.nn.Parameter(torch.rand(d_q, d))
W_key = torch.nn.Parameter(torch.rand(d_k, d))
W_value = torch.nn.Parameter(torch.rand(d_v, d))


# Computing the Unnormalized Attention Weights
# Work through the second input token (index 1) as the query.
x_2 = embedded_sentence[1]
query_2 = W_query.matmul(x_2)
key_2 = W_key.matmul(x_2)
value_2 = W_value.matmul(x_2)

print(query_2.shape)
print(key_2.shape)
print(value_2.shape)

# Project every token at once; the transposes put one token per row.
keys = W_key.matmul(embedded_sentence.T).T
values = W_value.matmul(embedded_sentence.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

# Unnormalized weight between the query token and the fifth token (index 4).
omega_24 = query_2.dot(keys[4])
print(omega_24)

# Unnormalized weights between the query token and every token.
omega_2 = query_2.matmul(keys.T)
print(omega_2)


# Computing the Attention Scores
# Scale by sqrt(d_k) before the softmax, then normalize over the tokens.
attention_weights_2 = F.softmax(omega_2 / d_k**0.5, dim=0)
print(attention_weights_2)

# Context vector: attention-weighted sum of the value vectors.
context_vector_2 = attention_weights_2.matmul(values)

print(context_vector_2.shape)
print(context_vector_2)

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}
tensor([0, 4, 5, 2, 1, 3])
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
          0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692],
        [ 0.5146,  0.9938, -0.2587, -1.0826, -0.0444,  1.6236, -2.3229,  1.0878,
          0.6716,  0.6933, -0.9487, -0.0765, -0.1526,  0.1167,  0.4403, -1.4465],
        [ 0.2553, -0.5496,  1.0042,  0.8272, -0.3948,  0.4892, -0.2168, -1.7472,
         -1.6025, -1.0764,  0.9031, -0.7218, -0.5951, -0.7112,  0.6230, -1.3729],
        [-1.3250,  0.1784, -2.1338,  1.0524, -0.3885, -0.9343, -0.4991, -1.0867,
          0.8805,  1.5542,  0.6266, -0.1755,  0.0983, -0.0935,  0.2662, -0.5850],
        [-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, -0.2010,
          0.4965, -1.5723,  0.9666, -1.1481, -1.1589,  0.3255, -0.6315, -2.8400],
        [ 0.8768,  1.6221, -1.4779,  1.1331, -1.2203,  1.3139,  1.0533,  0.1388,
        

## Multi-Head Attention

In [16]:
# Number of attention heads; every head gets its own weight matrices.
h = 3
multihead_W_query = torch.nn.Parameter(torch.rand(h, d_q, d))
multihead_W_key = torch.nn.Parameter(torch.rand(h, d_k, d))
multihead_W_value = torch.nn.Parameter(torch.rand(h, d_v, d))

# Per-head projections of the single query token x_2.
multihead_query_2 = multihead_W_query.matmul(x_2)
print(multihead_query_2.shape)

multihead_key_2 = multihead_W_key.matmul(x_2)
multihead_value_2 = multihead_W_value.matmul(x_2)

# One copy of the transposed inputs per head. Use `h` rather than a literal 3
# so the batch dimension always matches the weight tensors passed to bmm.
stacked_inputs = embedded_sentence.T.repeat(h, 1, 1)
print(stacked_inputs.shape)

# Batched matrix multiply: each head projects all tokens at once.
multihead_keys = torch.bmm(multihead_W_key, stacked_inputs)
multihead_values = torch.bmm(multihead_W_value, stacked_inputs)
print("multihead_keys.shape:", multihead_keys.shape)
print("multihead_values.shape:", multihead_values.shape)

# Swap the last two dimensions so each head holds one token per row.
multihead_keys = multihead_keys.permute(0, 2, 1)
multihead_values = multihead_values.permute(0, 2, 1)
print("multihead_keys.shape:", multihead_keys.shape)
print("multihead_values.shape:", multihead_values.shape)

torch.Size([3, 24])
torch.Size([3, 16, 6])
multihead_keys.shape: torch.Size([3, 24, 6])
multihead_values.shape: torch.Size([3, 28, 6])
multihead_keys.shape: torch.Size([3, 6, 24])
multihead_values.shape: torch.Size([3, 6, 28])


## Cross-Attention

In [19]:
torch.manual_seed(123)

# Embedding width of the first input sequence.
d = embedded_sentence.shape[1]
print("embedded_sentence.shape:", embedded_sentence.shape)

d_q, d_k, d_v = 24, 24, 28

W_query = torch.rand(d_q, d)
W_key = torch.rand(d_k, d)
W_value = torch.rand(d_v, d)

# The query still comes from the first sequence.
x_2 = embedded_sentence[1]
query_2 = W_query.matmul(x_2)
print("query.shape", query_2.shape)

# Self-attention baseline: keys and values from the same sequence.
keys = W_key.matmul(embedded_sentence.T).T
values = W_value.matmul(embedded_sentence.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

# 2nd input sequence. Its width is tied to `d` (instead of a literal 16) so
# it stays compatible with W_key and W_value.
embedded_sentence_2 = torch.rand(8, d)

# Cross-attention: keys and values now come from the second sequence, so
# their row count follows that sequence's length.
keys = W_key.matmul(embedded_sentence_2.T).T
values = W_value.matmul(embedded_sentence_2.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

embedded_sentence.shape: torch.Size([6, 16])
query.shape torch.Size([24])
keys.shape: torch.Size([6, 24])
values.shape: torch.Size([6, 28])
keys.shape: torch.Size([8, 24])
values.shape: torch.Size([8, 28])


# RAG on AWS

> https://aws.amazon.com/blogs/machine-learning/evaluate-rag-responses-with-amazon-bedrock-llamaindex-and-ragas/

> https://github.com/aws-samples/sample-rag-evaluation-ragas/blob/main/ragas_notebook.ipynb

> https://aws.amazon.com/blogs/machine-learning/from-concept-to-reality-navigating-the-journey-of-rag-from-proof-of-concept-to-production/


This post guides you through the process of assessing the quality of RAG responses with evaluation frameworks such as RAGAS and LlamaIndex, using Amazon Bedrock.

In this post, we are also going to leverage Langchain to create a sample RAG application.

Amazon `Bedrock` is a fully managed service that offers a choice of high-performing Foundation Models (FMs) from leading AI companies like AI21 Labs, Anthropic, Cohere, Meta, Stability AI, and Amazon via a single API, along with a broad set of capabilities you need to build generative AI applications with security, privacy, and responsible AI.

The Retrieval Augmented Generation Assessment (`RAGAS`) framework offers multiple metrics to evaluate each part of the RAG system pipeline, identifying areas for improvement. It utilizes foundation models to test individual components, aiding in pinpointing modules for development to enhance overall results.

`LlamaIndex` is a framework for building LLM applications. It simplifies data integration from various sources and provides tools for data indexing, engines, agents, and application integrations. Optimized for search and retrieval, it streamlines querying LLMs and retrieving documents. This blog post focuses on using its Observability/Evaluation modules.

`LangChain` is an open-source framework that simplifies the creation of applications powered by foundation models. It provides tools for chaining LLM operations, managing context, and integrating external data sources. LangChain is primarily used for building chatbots, question-answering systems, and other AI-driven applications that require complex language processing capabilities.

The solution consists of the following components:

1. `Evaluation dataset` – The source data for the RAG comes from the Amazon SageMaker FAQ, which represents 170 question-answer pairs. This corresponds to Step 1 in the architecture diagram.
2. `Build sample RAG` – Documents are segmented into chunks and stored in Amazon Bedrock Knowledge Bases (Steps 2–4). We use LangChain Retrieval Q&A to answer user queries. This process retrieves relevant data from an index at runtime and passes it to the Foundation Model (FM).
3. `RAG evaluation` – To assess the quality of the Retrieval-Augmented Generation (RAG) solution, we can use both RAGAS and LlamaIndex. An LLM performs the evaluation by comparing its predictions with ground truths (Steps 5–6).

### Prerequisites
To implement this solution, you need the following:

- An AWS account with privileges to create AWS Identity and Access Management (`IAM`) roles and policies. For more information, see Overview of access management: Permissions and policies.
- Access enabled for the `Amazon Titan Embeddings G1 – Text` model and `Anthropic Claude 3 Sonnet` on `Amazon Bedrock`. For instructions, see Model access.
- Run the prerequisite code provided in the Python notebook.

### Evaluation of RAG with RAGAS

Evaluating the RAG solution requires comparing LLM predictions with ground truth answers. To do so, we use the batch() function from LangChain to perform inference on all questions in our evaluation dataset.

Then we can use the evaluate() function from RAGAS to perform evaluation on each metric (answer relevancy, faithfulness, and answer correctness). It uses an LLM to compute the metrics. Feel free to use other metrics from RAGAS.

### Evaluation of RAG with LlamaIndex
LlamaIndex, similar to Ragas, provides a comprehensive RAG (Retrieval-Augmented Generation) evaluation module. This module offers a variety of metrics to assess the performance of your RAG system. The evaluation process generates two key outputs:

1. `Feedback`: The judge LLM (Language Model) provides detailed evaluation feedback in the form of a string, offering qualitative insights into the system’s performance.
2. `Score`: This numerical value indicates how well the answer meets the evaluation criteria. The scoring system varies depending on the specific metric being evaluated. For example, metrics like Answer Relevancy and Faithfulness are typically scored on a scale from 0 to 1.
These outputs allow for both qualitative and quantitative assessment of your RAG system’s performance, enabling you to identify areas for improvement and track progress over time.

### Chunking
1. Standard chunking

Amazon Bedrock supports the following standard approaches to chunking:

Fixed-size chunking: You can configure the desired chunk size by specifying the number of tokens per chunk and an overlap percentage, providing flexibility to align with your specific requirements. You can set the maximum number of tokens that a chunk must not exceed and the overlap percentage between consecutive chunks.

Default chunking: Splits content into text chunks of approximately 300 tokens. The chunking process honors sentence boundaries, ensuring that complete sentences are preserved within each chunk.

2. Hierarchical chunking

Hierarchical chunking involves organizing information into nested structures of child and parent chunks. When creating a data source, you are able to define the parent chunk size, child chunk size and the number of tokens overlapping between each chunk. During retrieval, the system initially retrieves child chunks, but replaces them with broader parent chunks so as to provide the model with more comprehensive context.

3. Semantic chunking

Semantic chunking is a natural language processing technique that divides text into meaningful chunks to enhance understanding and information retrieval. It aims to improve retrieval accuracy by focusing on the semantic content rather than just syntactic structure. By doing so, it may facilitate more precise extraction and manipulation of relevant information.

> https://docs.aws.amazon.com/bedrock/latest/userguide/kb-chunking.html#kb-hiearchical-chunking

### RAG evaluation concepts and metrics
> https://aws.amazon.com/blogs/machine-learning/evaluate-the-reliability-of-retrieval-augmented-generation-applications-using-amazon-bedrock/

As mentioned previously, a RAG-based generative AI application is composed of two main processes: retrieval and generation. Retrieval is the process where the application uses the user query to retrieve the relevant documents from a knowledge base before adding them as context to augment the final prompt. Generation is the process of generating the final response from the LLM. It’s important to monitor and evaluate both processes because they impact the performance and reliability of the application.

Evaluating RAG systems at scale requires an automated approach to extract metrics that are quantitative indicators of its reliability. Generally, the metrics to look for are grouped by main RAG components or by domains. Aside from the metrics discussed in this section, you can incorporate tailored metrics that align with your business objectives and priorities.

1. `Retrieval metrics`
You can use the following retrieval metrics:
    - `Context relevance` – This measures whether the passages or chunks retrieved by the RAG system are relevant for answering the given query, without including extraneous or irrelevant details. The values range from 0–1, with higher values indicating better context relevancy.
    - `Context recall` – This measures the alignment between the context and the expected RAG output, the ground truth. Similar to faithfulness, each statement in the ground truth is checked to see if it is attributed to the context (thereby evaluating the context).
    - `Context precision` – This evaluates the relevancy of the context to the answer, or in other words, the retriever’s ability to capture the best context to answer your query. An LLM verifies if the information in the given context is directly relevant to the question with a single “Yes” or “No” response. The context is passed in as a list, so if the list is size one (one chunk), then the metric for context precision is either 0 (representing the context isn’t relevant to the question) or 1 (representing that it is relevant). If the context list is greater than one (or includes multiple chunks), then context precision is between 0–1, representing a specific weighted average precision calculation. This involves the context precision of the first chunk being weighted heavier than the second chunk, which itself is weighted heavier than the third chunk, and onwards, taking into account the ordering of the chunks being outputted as contexts.

2. `Generation metrics`
You can use the following generation metrics:
    - `Faithfulness` – This measures the factual consistency of the generated answer against the given context, so it requires the answer and retrieved context as an input. This is a two-step prompt where the generated answer is first broken down into multiple standalone statements and propositions. Then, the evaluation LLM validates the attribution of the generated statement to the context. If the attribution can’t be validated, it’s assumed that the statement is at risk of hallucination. The answer is scaled to a 0–1 range; the higher the better.
    - `Answer relevance` – This focuses on how pertinent the generated RAG output (answer) is to the question. A lower score is assigned to answers that are incomplete or contain redundant information. To calculate this score, the LLM is asked to generate multiple questions from a given answer. Then using an Amazon Titan Embeddings model, embeddings are generated for the generated question and the actual question. The metric therefore is the mean cosine similarity between all the generated questions and the actual question.
    - `Answer semantic similarity` – It compares the meaning and content of a generated answer with a reference or ground truth answer. It evaluates how closely the generated answer matches the intended meaning of the ground truth answer. The score ranges from 0–1, with higher scores indicating greater semantic similarity between the two answers. A score of 1 means that the generated answer conveys the same meaning as the ground truth answer, whereas a score of 0 suggests that the two answers have completely different meanings. This assesses the semantic similarity between the RAG output (answer) and expected answer (ground truth), with a range between 0–1. A higher score signifies better performance. First, the embeddings of answer and ground truth are created, and then a score between 0–1 is predicted, representing the semantic similarity of the embeddings using a cross encoder Tiny BERT model.
    - `Answer correctness` – This is the accuracy between the generated answer and the ground truth. This is calculated from the semantic similarity metric between the answer and the ground truth in addition to a factual similarity by looking at the context. A threshold value is used if you want to employ a binary 0 or 1 answer correctness score, otherwise a value between 0–1 is generated.

3. `Aspects evaluation`
Aspects are evaluated as follows:
    - `Harmfulness` (Yes, No) – If the generated answer carries the risk of causing harm to people, communities, or more broadly to society
    - `Maliciousness` (Yes, No) – If the submission intends to harm, deceive, or exploit users
    - `Coherence` (Yes, No) – If the generated answer presents ideas, information, or arguments in a logical and organized manner
    - `Correctness` (Yes, No) – If the generated answer is factually accurate and free from errors
    - `Conciseness` (Yes, No) – If the submission conveys information or ideas clearly and efficiently, without unnecessary or redundant details

`Generator quality` can be assessed through several key metrics. `Context utilization` examines how effectively the generator uses relevant information from the provided source material. `Noise sensitivity` gauges the generator’s propensity to include inaccurate details from the retrieved content. `Hallucination` measures the extent to which the generator produces incorrect claims not present in the source data. `Self-knowledge` reflects the proportion of accurate statements generated that can’t be found in the retrieved chunks. Finally, `faithfulness` evaluates how closely the generator’s output aligns with the information contained in the source material.

For measuring the `overall generation quality`, the key metrics include measuring the `precision`, `recall`, and `answer similarity`. `Precision` suggests the proportion of the correct claims in model’s response, whereas `recall` suggests the proportion of the ground truth claims covered by the model’s response. `Answer similarity` compares the meaning and content of a generated answer with a reference or ground truth answer. It evaluates how closely the generated answer matches the intended meaning of the ground truth answer.

Establishing a feedback loop with an evaluation framework against these quality metrics allows for continuous improvement, where the system can learn from user interactions and refine its performance over time. By optimizing these quality metrics, the RAG system can be designed to deliver reliable, cost-effective, and high-performing results for users.

### Responsible AI
Implementing responsible AI practices is crucial for maintaining ethical and safe deployment of RAG systems. This includes using guardrails to filter harmful content, deny certain topics, mask sensitive information, and ground responses in verified sources to reduce hallucinations.

You can use Amazon Bedrock Guardrails for implementing responsible AI policies. Along with protecting against toxicity and harmful content, it can also be used for Automated Reasoning checks, which helps you protect against hallucinations.

### Cost and latency
Cost considers the compute resources and infrastructure required to run the system, and latency evaluates the response times experienced by end-users. To `optimize cost and latency`, implement `caching strategies` to reduce the need for expensive model inferences. Efficient `query batching` can also improve overall `throughput` and reduce resource usage. Balance performance and resource usage to find the ideal configuration that meets your application’s requirements.

### Hosting and scaling
When it comes to hosting your web application or service, there are several approaches to consider. The key is to choose a solution that can effectively host your database and compute infrastructure. This could include server-based options like Amazon Elastic Compute Cloud (Amazon EC2), managed services like Amazon Relational Database Service (Amazon RDS) and Amazon DynamoDB, or serverless approaches such as AWS Amplify and Amazon Elastic Container Service (Amazon ECS). For a practical approach to building an automated AI assistant using Amazon ECS, see Develop a fully automated chat-based assistant by using Amazon Bedrock agents and knowledge bases.

In addition to the server or compute layer, you will also need to consider an orchestration tool, testing environments, and a continuous integration and delivery (CI/CD) pipeline to streamline your application deployment. Having a feedback loop established based on the quality metrics along with a CI/CD pipeline is an important first step to creating self-healing architectures.

As your application grows, you will need to make sure your infrastructure can scale to meet the increasing demand. This can involve containerization with Docker or choosing serverless options, implementing load balancing, setting up auto scaling, and choosing between on-premises, cloud, or hybrid solutions. It also includes unique scaling requirements of your frontend application and backend generative AI workflow, as well as the use of content delivery networks (CDNs) and disaster recovery and backup strategies.

The following is a sample architecture for a secure and scalable RAG-based web application. This architecture uses Amazon ECS for hosting the service, Amazon CloudFront as a CDN, AWS WAF as a firewall, and Amazon MemoryDB for providing a semantic cache.

![title](pic/POC-TO-Productin-Blog-Architecture.jpeg)

### Data privacy, security, and observability
Maintaining data privacy and security is of utmost importance. This includes implementing security measures at each layer of your application, from encrypting data in transit to setting up robust authentication and authorization controls. It also involves focusing on compute and storage security, as well as network security. Compliance with relevant regulations and regular security audits are essential. Securing your generative AI system is another crucial aspect. By default, Amazon Bedrock Knowledge Bases encrypts the traffic using AWS managed AWS Key Management Service (AWS KMS) keys. You can also choose customer managed KMS keys for more control over encryption keys. For more information on application security, refer to Safeguard a generative AI travel agent with prompt engineering and Amazon Bedrock Guardrails.

Comprehensive logging, monitoring, and maintenance are crucial to maintaining a healthy infrastructure. This includes setting up structured logging, centralized log management, real-time monitoring, and strategies for system updates and migrations.

By addressing these critical areas, you can build a secure and resilient infrastructure to support your growing web application or service. Stay tuned for more in-depth coverage of these topics in upcoming blog posts.

By using purpose-built tools like Amazon Bedrock Knowledge Bases to streamline the end-to-end RAG workflow, organizations can successfully transition their RAG-powered proofs of concept into high-performing, cost-effective, secure production-ready solutions that deliver business value.

![title](pic/multilingual_bedrock_figure1.png)

The workflow includes the following steps:
1. The user sends a prompt for querying the documents to the REST API.
2. Amazon API Gateway sends the user prompt as an event to a Lambda function.
3. The Lambda function invokes Amazon Bedrock API and sends the prompt to the Anthropic Claude 3 Sonnet model.
4. The LLM parses the prompt and does a similarity search with vector embeddings.
5. Enhanced context from the knowledge base is used to generate a text response.
6. The final text response is returned by Amazon Bedrock to the Lambda function.
7. The user receives the final response through the REST API.

This is an event-driven architecture composed of individual AWS services that are loosely integrated with each other, with each service handling a specific function. It uses AWS serverless technologies, allowing you to build and run your application without having to manage your own servers. All server management is done by AWS, providing many benefits such as automatic scaling and built-in high availability, letting you take your idea to production quickly.

1. `Data Store Selection (Decision Criteria)`

A. Data Type & Structure
  - Unstructured Data (e.g., images, videos): Use S3.
  - Key-Value Pairs (e.g., user profiles): Use DynamoDB or ElastiCache.
  - Relational Data (e.g., orders, transactions): Use Aurora or RDS.
  - Time-Series (e.g., IoT sensor data): Use Timestream or Keyspaces.
  - Vector Embeddings (RAG): Use OpenSearch (k-NN plugin) or Aurora (pgvector).

B. Access Patterns
  - High Read/Write Throughput: Use DynamoDB (up to 100K+ requests/sec) or ElastiCache (for caching).
  - Low Latency (Microseconds): Use ElastiCache (Redis).
  - Complex Queries (Joins, Aggregations): Use Aurora or Redshift (analytics).

C. Scalability
  - Auto-Scaling: DynamoDB (on-demand mode) or S3 (infinite storage).
  - Manual Scaling: RDS/Aurora (requires read replicas) or ElastiCache (resize clusters).

D. Cost
  - Lowest Storage Cost: S3 (e.g., $0.023/GB for Standard Tier).
  - Pay-Per-Request: DynamoDB On-Demand or Keyspaces.
  - In-Memory Performance: ElastiCache (higher cost but ultra-fast).

When to Use Each Service
  - Amazon S3
    Use Cases: Static website hosting, data lakes, ML training datasets.
    Backup/archival (with S3 Glacier).
  - DynamoDB
    Use Cases: Serverless apps, real-time dashboards, session stores. Metadata storage for S3 objects (e.g., file attributes).
  - ElastiCache (Redis)
    Use Cases: Caching LLM responses in RAG to reduce latency. Real-time leaderboards, rate limiting.
  - Amazon Aurora
    Use Cases: Transactional systems requiring SQL joins. RAG with structured metadata + vector search (via pgvector).

Hybrid Architectures

Combine services for optimal performance: 
  - S3 + DynamoDB: Store large files in S3, metadata in DynamoDB.
  - ElastiCache + Aurora: Cache frequent SQL query results in Redis.
  - OpenSearch + S3: Index documents in OpenSearch, store raw files in S3.

`S3 (documents) → OpenSearch (vector index) → ElastiCache (cache) → DynamoDB (user metadata)`

Key Takeaways
  - Start Simple: Use DynamoDB for serverless apps, S3 for files.
  - Prioritize Latency: Use ElastiCache for microsecond responses.

RAG-Specific:
  - Vector search: OpenSearch or Aurora (pgvector).
  - Raw documents: S3.
  - Caching: ElastiCache.
  - Cost Optimization: Use S3 Intelligent-Tiering for unpredictable access.
    Enable DynamoDB Auto-Scaling to avoid overprovisioning.



2. `EKS, ECS`
- Why Use ECR in RAG?
ECR serves as the backbone for securely storing and managing container images required for RAG components (e.g., retrieval services, LLM APIs, vector databases).
Key Pros:

Tight AWS Integration: Seamlessly works with ECS/EKS for image deployment and IAM-based access control, ensuring secure image pulls.

Security & Compliance: Built-in vulnerability scanning and encryption (at rest/in transit) for container images, critical for AI/ML workloads handling sensitive data.

Lifecycle Management: Automatically cleans up unused images to reduce costs and maintain efficiency.

High Availability: Replicates images across AWS Availability Zones, ensuring reliability for production-grade RAG systems.

Cons:

Cost for Large Repositories: Storage and data-transfer fees can add up for teams with frequent image updates or large datasets.

Limited IPv6 Support: May require workarounds if IPv6 is essential for your network setup.

- Why Use ECS in RAG?
ECS simplifies deploying and scaling containerized RAG components (e.g., retrieval APIs, generative models).
Key Pros:

Serverless with Fargate: Avoid managing servers, reducing operational overhead for bursty RAG workloads.

AWS Service Integration: Directly integrates with ALB, CloudWatch, and RDS for streamlined monitoring and database connectivity.

Simplified Scaling: Auto Scaling and load balancing ensure dynamic resource allocation for fluctuating query demands.

Cost Efficiency: No control-plane fees (unlike EKS), making it cheaper for smaller teams.

Cons:

Limited Flexibility: Less control over infrastructure compared to Kubernetes (e.g., no native pod-level resource sharing).

Vendor Lock-in: Tight coupling with AWS services complicates multi-cloud or hybrid deployments.

- Combined Benefits of ECR + ECS in RAG
Streamlined Workflow: Push RAG component images to ECR and deploy via ECS tasks/services without managing external registries.

Security Synergy: IAM roles in ECS securely pull images from ECR, reducing credential exposure risks.

Scalability: ECS auto-scaling pairs with ECR’s high-throughput image distribution to handle traffic spikes.

- Implementation Example
ECR Setup:

Create a private repository for RAG components (e.g., rag-llm-api).

Enable scan-on-push and lifecycle policies to manage image versions.

ECS Deployment:

Define ECS tasks referencing ECR images for retrieval and generation services.

Use Fargate for serverless scaling or EC2 for GPU-optimized LLM inference.

CI/CD Integration:

Automate image builds/pushes to ECR via GitHub Actions and deploy updates to ECS.

3. `Platform for different AI providers`
4. `Metrics`
5. `Retriever and Reranker (bi-encoder, cross-encoder)`

from IPython.display import display, HTML
# Render the cross-encoder and re-ranking figures side by side in one flex row.
_figures_html = """
<div style="display: flex;">
    <img src="pic/cross_encoder.png" style="width:50%; margin:5px;">
    <img src="pic/rerank.png" style="width:50%; margin:5px;">
</div>
"""
display(HTML(_figures_html))

<!-- ![title](pic/cross_encoder.png) -->
<!-- <img src="pic/cross_encoder.png" alt="title" style="width:50%; height:auto;"> -->

<!-- ![title](pic/rerank.png) -->
<!-- <img src="pic/rerank.png" alt="title" style="width:50%; height:auto;"> -->

> https://osanseviero.github.io/hackerllama/blog/posts/sentence_embeddings2/

Sentence Transformers supports two types of models: Bi-encoders and Cross-encoders. Bi-encoders are faster and more scalable, but cross-encoders are more accurate. Although both tackle similar high-level tasks, when to use one versus the other is quite different. Bi-encoders are better for search, and cross-encoders are better for classification and high-accuracy ranking. Let’s dive into the details!

Bi-encoders are models that encode the input text into a fixed-length vector. To compute the similarity between two sentences, we usually encode them into two vectors and then compute the similarity between those vectors (e.g., by using cosine similarity). We train bi-encoders to increase the similarity between the query and relevant sentences and to decrease the similarity between the query and the other sentences. This is why bi-encoders are better suited for search. As the previous blog post showed, bi-encoders are fast and easily scalable. If multiple sentences are provided, the bi-encoder will encode each sentence independently. This means that the sentence embeddings are independent of each other. This is a good thing for search, as we can encode millions of sentences in parallel. However, this also means that the bi-encoder doesn’t know anything about the relationship between the sentences.

When we use cross-encoders, we do something different. Cross-encoders encode the two sentences simultaneously and then output a classification score. The figure below shows the high-level differences.

Why would you use one versus the other? Cross-encoders are slower and more memory intensive but also much more accurate. A cross-encoder is an excellent choice to compare a few dozen sentences. If you want to compare hundreds of thousands of sentences, a bi-encoder is a better choice, as otherwise a cross-encoder could take multiple hours. What if you care about accuracy and want to compare thousands of sentences efficiently? This is a typical case when you want to retrieve information. In those cases, an option is first to use a bi-encoder to reduce the number of candidates (i.e., get the top 20 most relevant examples) and then use a cross-encoder to get the final result. This is called re-ranking and is a common technique in information retrieval.

As mentioned, cross-encoders encode two texts simultaneously and then output a classification label. The cross-encoder first generates a single embedding that captures representations and their relationships. Compared to bi-encoder-generated embeddings (which are independent of each other), cross-encoder embeddings are dependent on each other. This is why cross-encoders are better suited for classification, and their quality is higher: they can capture the relationship between the two sentences! On the flip side, cross-encoders are slow if you need to compare thousands of sentences since they need to encode all the sentence pairs.

Let’s say you have four sentences, and you need to compare all the possible pairs:

A bi-encoder would need to encode each sentence independently, so it would need to encode four sentences.
A cross-encoder would need to encode all the possible pairs, so it would need to encode six pairs (AB, AC, AD, BC, BD, CD).
Let’s scale this. Let’s say you have 100,000 sentences, and you need to compare all the possible pairs:

A bi-encoder would encode 100,000 sentences.
A cross-encoder would encode 4,999,950,000 pairs! (Using the combinations formula: n! / (r!(n-r)!), where n=100,000 and r=2). No wonder they don’t scale well!
Hence, it makes sense they are slower!


6. `ParentDocumentRetriever, parent_splitter and child_splitter`

7. `Airflow VS. MLflow`

`Airflow DAG → MLflow Model → FastAPI (LLM) → OpenSearch (Retrieval)`

1. Apache Airflow in RAG
Purpose:
Orchestrate and automate data/ML pipelines (e.g., document ingestion, embedding updates, model retraining).

Use Cases:
Schedule periodic data ingestion from S3/APIs into your vector DB.

Retrain embedding models or update FAISS/OpenSearch indices.

Monitor and restart failed pipeline components (e.g., broken API calls to LLMs).


Key Features:
Directed Acyclic Graphs (DAGs) for workflow dependencies.

Integrations: AWS Lambda, ECS, Kubernetes, OpenSearch.

Monitoring: Built-in UI for task status, retries, and logs.

2. MLflow in RAG
Purpose:
Manage the machine learning lifecycle (experiment tracking, model registry, deployment).

Use Cases:
Track experiments with different LLMs/embedding models.

Version and deploy fine-tuned models (e.g., custom sentence-transformers).

Compare retrieval performance (e.g., recall@k for vector search).

Key Features:
Experiment Tracking: Log parameters, metrics, and artifacts.

Model Registry: Stage models (Staging/Production) with versioning.

Deployment: Serve models via REST API or batch inference.

4. How They Complement Each Other
Combined RAG Pipeline:
Airflow schedules daily tasks:

Ingests new documents → generates embeddings → updates OpenSearch.

Triggers model retraining if data drift is detected.

MLflow manages the ML side:

Tracks embedding model performance during retraining.

Deploys the best model to an API endpoint for real-time inference.

5. When to Use Each
Use Airflow If:

You need to automate multi-step workflows (e.g., ingest → embed → index).

Your RAG system requires cron-like scheduling and error handling.

Use MLflow If:

You’re experimenting with multiple LLMs/embedding models.

You need version control for models and reproducibility.

6. Final Recommendation
For a production RAG system:

Use Airflow to automate data/retrieval pipelines.

Use MLflow to track and version embedding models and LLMs.

Combine both for end-to-end traceability (e.g., Airflow triggers MLflow runs).

In [None]:
[User Request] → CloudFront → API Gateway (WAF) → ALB → EKS (Orchestrator)
               ↗         ↘                ↖       ↓
           Lambda (Caching)          SQS (Priority Queues)
               ↓                        ↘
[VPC] → SageMaker (BGE Retriever) → OpenSearch (k-NN) → DAX Cache
               ↓                          ↖
[VPC] → SageMaker (Llama3 Generator)      DynamoDB (Feedback)
               ↓                          ↗
           CloudWatch/X-Ray         Step Functions (Human Eval)


[User Request] → CloudFront (Global Cache)
               → API Gateway (Rate Limiting)
               → AWS Lambda (Orchestrator)
               → SQS (Decouple Retriever & Generator)
               ↗                        ↘
[Retriever (SageMaker + OpenSearch)]  [Generator (SageMaker LLM)]
               ↖                        ↙
[S3 (10B PDFs)] → Glue (Catalog) → Textract/EC2 Batch (PDF Processing)


User Query:

Request → CloudFront (cached responses) → API Gateway → ALB → EKS (orchestrator).

Retrieval:

EKS calls BGE endpoint → OpenSearch → DAX (cache hit/miss).

Generation:

EKS sends top-3 chunks + query to Llama3 endpoint.

Evaluation:

Answer logged in S3 → Bedrock evaluation → Human feedback via Ground Truth.

Retraining:

Low-quality answers trigger SageMaker Pipelines to fine-tune BGE/Llama3.

`A NAT Gateway (Network Address Translation Gateway)` is a managed AWS service that allows instances in a private subnet to connect to the internet (e.g., for updates, API calls) while blocking unsolicited inbound traffic. It acts as a bridge between private subnets and the public internet.

Outbound-only internet access:
- Private instances (e.g., databases, backend servers) can initiate requests to the internet.
- External entities cannot initiate connections to these instances.

1. Secure Infrastructure
VPC Design:

Deploy SageMaker, OpenSearch, EKS, and RDS in a multi-AZ VPC with private subnets.

Use NAT Gateways for outbound traffic; no public IPs for backend services.

Encryption:

Encrypt all data (S3, OpenSearch, EBS) with AWS KMS (customer-managed keys).

Use TLS 1.3 for data in transit via API Gateway and ALB.

Network Security:

AWS WAF on API Gateway to block SQLi/abusive requests.

PrivateLink for SageMaker, OpenSearch, and S3 to avoid public internet exposure.

1. Data Ingestion & Preprocessing
Storage:

Store 10B PDFs in S3 with versioning and lifecycle policies (e.g., tier to Glacier for archived docs).

Use S3 Batch Operations to process large volumes.

PDF Extraction:

Use Amazon Textract with AWS Batch (EC2 Spot Fleet) for parallel text extraction. For complex legal formats, deploy custom OCR containers on ECS/EKS.

Chunking & Embeddings:

Use AWS Glue to catalog extracted text.

Chunk text into 512-token segments using Lambda or AWS Fargate.

Generate embeddings with a SageMaker Batch Transform job (e.g., BAAI/bge-large-en model).

Vector Database:

Use Amazon OpenSearch Serverless (vector engine) for scalable storage of embeddings. For extreme scale, use Cassandra + Vector Plugin on EC2.

Encrypt data with AWS KMS.

In [None]:
pulumi-rag/
├── Pulumi.yaml        # Project config
├── __main__.py        # Main stack (Python)
└── components/
    ├── networking.py  # VPC components
    └── llm.py         # SageMaker resources


# components/opensearch.py
import pulumi
from pulumi_aws import opensearch

class VectorStore:
    """OpenSearch Serverless vector-search collection with its encryption policy.

    Args:
        name: Prefix for the Pulumi resource names and the collection name.
        vpc_id: Accepted for interface compatibility; this component does not
            use it yet.

    Attributes:
        encryption_policy: The encryption security policy covering the collection.
        collection: The ``VECTORSEARCH`` serverless collection.
    """

    def __init__(self, name: str, vpc_id: str):
        import json  # local import: the surrounding snippet never imports json

        collection_name = f"{name}-vectors"

        # Encryption policy using an AWS-owned key (not a customer-managed KMS
        # key). Rules match collections by "collection/<name>", which is a plain
        # string known up front, so json.dumps never sees a Pulumi Output.
        self.encryption_policy = opensearch.ServerlessSecurityPolicy(
            f"{name}-policy",
            policy=json.dumps({
                "Rules": [{
                    "Resource": [f"collection/{collection_name}"],
                    "ResourceType": "collection",
                }],
                "AWSOwnedKey": True,
            }),
            type="encryption",
        )

        # A matching encryption policy must already exist when the collection is
        # created, so make the ordering explicit.
        self.collection = opensearch.ServerlessCollection(
            collection_name,
            name=collection_name,
            type="VECTORSEARCH",
            opts=pulumi.ResourceOptions(depends_on=[self.encryption_policy]),
        )

# Dynamic scaling based on input: the BGE embedder gets the smaller GPU
# instance type, every other model the large one.
# NOTE(review): `sagemaker` is not imported in the visible snippet (presumably
# pulumi_aws.sagemaker), and `instance_type` may not be a valid Model
# argument. Confirm against the provider docs before running.
for model in ("llama3-70b", "bge-large"):
    if "bge" in model:
        instance_type = "ml.g5.2xlarge"
    else:
        instance_type = "ml.p4d.24xlarge"
    sagemaker.Model(f"rag-{model}", instance_type=instance_type)

# Type hints and autocomplete
from pulumi_aws.sagemaker import Endpoint

# SageMaker endpoint serving the RAG model.
# NOTE(review): `config` is defined elsewhere; presumably it is the
# sagemaker.EndpointConfiguration resource. Confirm.
endpoint = Endpoint(
    "rag-endpoint",
    # The provider argument is `endpoint_config_name`; `config_name` is not an
    # accepted keyword and fails at construction time.
    endpoint_config_name=config.name,
    tags={
        "Project": "Legal-RAG",
    },
)

# Cross-region DB replica
from pulumi_aws import rds

# Read replica of the primary database, placed in us-west-2a.
# NOTE(review): `primary_db` is defined elsewhere; this assumes it is an
# rds.Instance living in a different region than the replica. Confirm, and
# make sure the replica is created with a provider for the target region.
rds.Instance(
    "replica",
    # A cross-region replica must reference its source by ARN; the bare
    # identifier only resolves within the same region.
    replicate_source_db=primary_db.arn,
    # instance_class is a required argument; mirror the primary's size.
    instance_class=primary_db.instance_class,
    availability_zone="us-west-2a",
)

# 8. Monitoring as Code

from pulumi_aws import cloudwatch

# Alarm when the average model latency exceeds the threshold.
# NOTE(review): SageMaker publishes ModelLatency in microseconds, so 500 means
# 0.5 ms. Confirm the intended unit. Without `dimensions` (EndpointName /
# VariantName) the alarm will not match a specific endpoint's metric; add them
# once the endpoint name is known.
cloudwatch.MetricAlarm(
    "high_latency",
    namespace="AWS/SageMaker",
    metric_name="ModelLatency",
    statistic="Average",
    period=60,  # seconds per evaluation window
    evaluation_periods=1,  # required argument; alarm after one breaching window
    threshold=500,
    comparison_operator="GreaterThanThreshold",
)

## 1. System Architecture
The RAG pipeline consists of:

Embedding Model (e.g., BAAI/bge-small-en)
→ Converts queries/documents into vectors.

Vector Database (e.g., Pinecone, FAISS, AWS OpenSearch)
→ Stores & retrieves relevant chunks.

Generation Model (e.g., Llama 2, GPT-3.5)
→ Generates answers using retrieved context.

```mermaid
graph LR
    A[User Query] --> B[Embedding Model]
    B --> C[Vector DB Retrieval]
    C --> D[Generation Model]
    D --> E[Final Answer]
```

## 2. Deploying Models on SageMaker

In [None]:
# (A) Deploy Embedding Model

from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Initialize SageMaker session and resolve the execution role of this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Deploy Embedding Model (e.g., BAAI/bge-small-en)
embedding_model = HuggingFaceModel(
    model_data="s3://my-bucket/models/bge-small-en.tar.gz",  # Custom model .tar.gz
    role=role,
    transformers_version="4.28.1",
    pytorch_version="2.0.0",
    # py_version is required whenever framework versions are given instead of
    # an explicit image_uri.
    py_version="py310",
    # The SDK keyword for the custom inference script is `entry_point`.
    entry_point="embedding_inference.py",
    sagemaker_session=sagemaker_session,
)

# Create a real-time endpoint named "embedding-model" on a single GPU instance.
embedding_predictor = embedding_model.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    endpoint_name="embedding-model",
)

In [None]:
# Inference Script (embedding_inference.py):
from transformers import AutoTokenizer, AutoModel
import torch

def model_fn(model_dir):
    """Load the tokenizer and the encoder stored under ``model_dir``.

    Returns:
        A dict with the keys ``"tokenizer"`` and ``"model"``.
    """
    return {
        "tokenizer": AutoTokenizer.from_pretrained(model_dir),
        "model": AutoModel.from_pretrained(model_dir),
    }

def predict_fn(data, model_dict):
    """Embed ``data["text"]`` by mean-pooling the encoder's last hidden state.

    Args:
        data: Mapping with a ``"text"`` entry (a string or a list of strings).
        model_dict: Mapping with ``"tokenizer"`` and ``"model"`` entries, as
            returned by ``model_fn``.

    Returns:
        A list with one embedding (list of floats) per input text.
    """
    inputs = model_dict["tokenizer"](data["text"], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model_dict["model"](**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only: with padding=True, shorter texts in a batch
    # carry padding positions that would otherwise skew a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    return (summed / token_counts).tolist()  # Return embeddings

In [None]:
# (B) Deploy Generation Model

# Deploy Llama 2 from a packaged model archive in S3.
# NOTE(review): confirm that a Hugging Face inference container exists for this
# transformers/pytorch/python version combination and that it can load Llama 2.
generation_model = HuggingFaceModel(
    model_data="s3://my-bucket/models/llama-2-7b.tar.gz",
    role=role,
    transformers_version="4.30.2",
    pytorch_version="2.0.0",
    # py_version is required whenever framework versions are given instead of
    # an explicit image_uri.
    py_version="py310",
    # The SDK keyword for the custom inference script is `entry_point`.
    entry_point="generation_inference.py",
)

# Create a real-time endpoint named "generation-model" on a single GPU instance.
generation_predictor = generation_model.deploy(
    instance_type="ml.g5.4xlarge",  # Large GPU for LLM
    initial_instance_count=1,
    endpoint_name="generation-model",
)

In [None]:
# Inference Script (generation_inference.py)

from transformers import AutoTokenizer, AutoModelForCausalLM

def model_fn(model_dir):
    """Load the tokenizer and the causal language model stored under ``model_dir``.

    Returns:
        A dict with the keys ``"tokenizer"`` and ``"model"``.
    """
    return {
        "tokenizer": AutoTokenizer.from_pretrained(model_dir),
        "model": AutoModelForCausalLM.from_pretrained(model_dir),
    }

def predict_fn(data, model_dict):
    """Generate up to 100 new tokens for ``data["prompt"]`` and return the text.

    Args:
        data: Mapping with a ``"prompt"`` entry.
        model_dict: Mapping with ``"tokenizer"`` and ``"model"`` entries, as
            returned by ``model_fn``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    tokenizer, model = model_dict["tokenizer"], model_dict["model"]
    encoded = tokenizer(data["prompt"], return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=100)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# (C) Set Up Vector Database

# Use Pinecone (Serverless) or FAISS (Self-hosted)
import pinecone

# Connect to Pinecone and open the index used by the RAG pipeline.
pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")
index = pinecone.Index("rag-index")

# Store embeddings (run once). The `...` entries are placeholders for the
# remaining vector components.
seed_vectors = [
    ("doc1", [0.1, 0.2, ...]),
    ("doc2", [0.3, 0.4, ...]),
]
index.upsert(seed_vectors)

## 3. Orchestrating the RAG Pipeline

In [None]:
# Deploy a Lambda + API Gateway to connect all components
import json
import os

import boto3
import pinecone

# Runtime client used to invoke both SageMaker endpoints. Named
# `sagemaker_runtime` so it does not shadow the `sagemaker` SDK module.
sagemaker_runtime = boto3.client("sagemaker-runtime")

# Pinecone is not an AWS service, so boto3 cannot create a client for it; use
# the Pinecone client library instead. Credentials come from the environment
# rather than being hard-coded.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp"),
)
index = pinecone.Index("rag-index")


def lambda_handler(event, context):
    """Answer ``event["query"]`` with retrieval-augmented generation.

    Steps: embed the query via the "embedding-model" endpoint, fetch the top 3
    matches from the "rag-index" Pinecone index, then send the retrieved text
    plus the question to the "generation-model" endpoint.

    Args:
        event: Mapping with a ``"query"`` entry.
        context: Lambda context object (unused).

    Returns:
        ``{"answer": <decoded response of the generation endpoint>}``.
    """
    # Step 1: Get query embedding
    query = event["query"]
    embedding_response = sagemaker_runtime.invoke_endpoint(
        EndpointName="embedding-model",
        ContentType="application/json",
        Body=json.dumps({"text": query}),
    )
    query_embedding = json.loads(embedding_response["Body"].read())
    # The embedding endpoint returns one vector per input text (a nested list);
    # unwrap the single row so the vector DB receives a flat vector.
    if query_embedding and isinstance(query_embedding[0], list):
        query_embedding = query_embedding[0]

    # Step 2: Retrieve relevant docs from Vector DB
    retrieved_docs = index.query(
        vector=query_embedding,
        top_k=3,
        include_metadata=True,
    )

    # Step 3: Generate answer using LLM
    # Use a separate local name so the handler's `context` parameter is not
    # overwritten.
    retrieved_context = " ".join(doc["metadata"]["text"] for doc in retrieved_docs["matches"])
    prompt = f"Answer based on: {retrieved_context}\n\nQuestion: {query}\nAnswer:"

    generation_response = sagemaker_runtime.invoke_endpoint(
        EndpointName="generation-model",
        ContentType="application/json",
        Body=json.dumps({"prompt": prompt}),
    )
    answer = json.loads(generation_response["Body"].read())

    return {"answer": answer}