In [3]:
# Load secrets and settings from the local .env file into the process environment
import os
from dotenv import load_dotenv
load_dotenv("./.env")
# OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
# Raises KeyError when HUGGING_FACE_TOKEN is not present in the environment
HF_TOKEN = os.environ['HUGGING_FACE_TOKEN']

# Load the documents from local
from llama_index.core import SimpleDirectoryReader

# The PDF folder can be overridden with a RAG_DATA_DIR environment variable
# (for example an entry in .env, which is loaded above); when it is not set,
# the original location is used, so existing setups keep working unchanged.
DATA_DIR = os.environ.get("RAG_DATA_DIR", "C:/Users/dhinesh.m/Documents/Practical_RAG/data")

# Only files with a .pdf extension are read from DATA_DIR
documents = SimpleDirectoryReader(input_dir=DATA_DIR, required_exts=['.pdf']).load_data()

In [4]:
# Report how many document objects were loaded
document_count = len(documents)
print("Length of documents", document_count)

Length of documents 50


In [5]:
# Split the loaded documents into overlapping chunks (nodes) ready for embedding
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(
    documents,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
# Number of nodes (chunks) produced from the loaded documents
len(nodes)

111

In [None]:
# !pip install llama-index-embeddings-google-genai

In [15]:
# Configure the Hugging Face embedding model, an empty FAISS index and the LLM
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.settings import Settings
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import faiss

# embed_model = OpenAIEmbedding(model = 'text-embedding-ada-002', api_key = OPENAI_API_KEY)
# embed_model = GoogleGenAIEmbedding(model_name = 'text-embedding-004', api_key = GOOGLE_API_KEY)

# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally - confirm the repository is trusted.
embed_model = HuggingFaceEmbedding(model_name="nomic-ai/nomic-embed-text-v1",trust_remote_code=True)
Settings.embed_model = embed_model

# Take the vector size from the model itself instead of hard-coding it, so the
# FAISS index always matches whichever embedding model is configured above.
embed_dimension = len(embed_model.get_text_embedding("dimension probe"))
faiss_index = faiss.IndexFlatL2(embed_dimension)

# Default LLM for LlamaIndex components that are not given one explicitly
llm = HuggingFaceInferenceAPI(model="mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)
Settings.llm = llm


<All keys matched successfully>


In [8]:
# Build a FAISS-backed vector index over the chunked nodes, then save it to disk
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_model,
    storage_context=storage_context,
    show_progress=True,
)

# Write the index and vector store under ./storage so they can be reloaded later
index.storage_context.persist(persist_dir="./storage")

Generating embeddings:   0%|          | 0/111 [00:00<?, ?it/s]

In [None]:
# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# # llm = Gemini(model="models/gemini-1.5-flash",api_key= GOOGLE_API_KEY)
# llm = HuggingFaceInferenceAPI(model="mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)

# query_engine = index.as_chat_engine(llm = llm, similarity_top_k=3)
# result = query_engine.query("what is transformer attention?")
# print(result.response)

In [16]:
# Reload the index from the persisted directory.
# StorageContext and FaissVectorStore are imported here too, so this cell does
# not depend on the index-building cell having been run in the same kernel.
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore

vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
index = load_index_from_storage(storage_context=storage_context)

In [None]:
# !pip install llama-index-llms-google-genai

In [17]:
# Set up the LLM, the question-answering prompt and the response synthesizer
from llama_index.core import get_response_synthesizer
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
# from llama_index.llms.openai import OpenAI
# from llama_index.llms.gemini import Gemini

# Alternative LLM back ends, currently disabled:
# llm = OpenAI(model = 'gpt-3.5-turbo', api_key = OPENAI_API_KEY)
# llm = Gemini(model="models/gemini-1.5-flash",api_key= GOOGLE_API_KEY)
llm = HuggingFaceInferenceAPI(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)

# Prompt text with {context_str} and {query_str} placeholders
qa_prompt = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the question: {query_str}"
)

# Response synthesizer wired to the LLM and the custom QA prompt
response_synthesizer = get_response_synthesizer(
    llm=llm,
    text_qa_template=qa_prompt,
    response_mode="compact",
)



In [18]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Retriever configured with similarity_top_k=3
retriever = index.as_retriever(similarity_top_k=3)

# Query engine combining the retriever with the response synthesizer
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=response_synthesizer
)


In [27]:
def ask_and_show(question, preview_chars=100):
    """Run a question through the query engine and print the answer with its sources.

    Args:
        question: Query string passed to ``query_engine.query``.
        preview_chars: How many leading characters of each source node's text
            are printed as a sample.

    Returns:
        The object returned by ``query_engine.query``.
    """
    # query_engine is looked up when the function is called, so a query engine
    # rebuilt in an earlier cell is picked up without redefining this helper.
    response = query_engine.query(question)

    print("Response: ", response.response, end="\n\n")
    print("Source details: ", end="\n")
    for src in response.source_nodes:
        print(src.node.metadata)
        print("Sample text: ", src.node.text[:preview_chars])
        print("")
    return response


result = ask_and_show("What are main components of a RAG model and how do they react ?")

Response:  The main components of a RAG model are a retriever and a generator. The retriever, denoted as p η(z |x ), is a component that uses the input sequence x to retrieve text documents z and returns distributions over text passages given a query x. The generator, denoted as p θ(y i |x,z,y 1:i −1), is a parametrized component that generates the target sequence y based on the input sequence x, the retrieved text documents z, and the previously generated sequence y 1:i −1.

The retriever component in a RAG model is non-parametric, meaning it does not have learnable parameters. Instead, it relies on a retrieval strategy to select relevant text documents from a large corpus. The generator component, on the other hand, is parametric and learns to generate sequences based on the input and context provided by the retriever.

In terms of how they react, the retriever component retrieves text documents based on the input query, while the generator component generates sequences based on the 

In [28]:
# Ask about the encoder sub-layers, then print the answer and its source nodes
result = query_engine.query("What are two sub layers in each encoder block of the transformer model ?")

print("Response: ", result.response, end="\n\n")
print("Source details: ", end="\n")
for source in result.source_nodes:
    retrieved_node = source.node
    print(retrieved_node.metadata)
    print("Sample text: ", retrieved_node.text[:100])
    print("")

Response:  The two sub-layers in each encoder block of the Transformer model are a multi-head self-attention mechanism and a simple, position-wise fully connected feed-forward network.

Source details: 
{'page_label': '3', 'file_name': 'transformers.pdf', 'file_path': 'C:\\Users\\dhinesh.m\\Documents\\Practical_RAG\\data\\transformers.pdf', 'file_type': 'application/pdf', 'file_size': 2215244, 'creation_date': '2025-05-06', 'last_modified_date': '2025-05-06'}
Sample text:  Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture us

{'page_label': '5', 'file_name': 'transformers.pdf', 'file_path': 'C:\\Users\\dhinesh.m\\Documents\\Practical_RAG\\data\\transformers.pdf', 'file_type': 'application/pdf', 'file_size': 2215244, 'creation_date': '2025-05-06', 'last_modified_date': '2025-05-06'}
Sample text:  See Figure 2.
3.3 Position-wise Feed-Forward Networks
In addition to attention sub-layers, each of t

{'page_label': '5', 'file_name': 'transform

In [29]:
# Ask about positional encoding, then print the answer and its source nodes
result = query_engine.query("Explain how positional encoding is implemented in transformer and why is it necessary?")

print("Response: ", result.response, end="\n\n")
print("Source details: ", end="\n")
for source in result.source_nodes:
    retrieved_node = source.node
    print(retrieved_node.metadata)
    print("Sample text: ", retrieved_node.text[:100])
    print("")

Response:  In the Transformer model, positional encoding is implemented to provide the model with information about the relative or absolute position of the tokens in the sequence, as the model itself does not have any recurrence or convolution to inherently understand the order of the sequence.

The positional encodings have the same dimension as the embeddings, allowing them to be summed with the input embeddings at the bottoms of the encoder and decoder stacks. In this work, the authors use sine and cosine functions of different frequencies for the positional encodings. Each dimension of the positional encoding corresponds to a sinusoid, and the wavelengths form a geometric progression from 2π to 10000 · 2π. This function is chosen because it allows the model to easily learn to attend by relative positions, as for any fixed offset k, the positional encoding at position pos+k can be represented as a linear function of the positional encoding at position pos.

Positional encoding is n

In [30]:
# Ask about multi-head attention, then print the answer and its source nodes
result = query_engine.query("Describe the concept of multi-head attention in transformer and why is it beneficial ?")

print("Response: ", result.response, end="\n\n")
print("Source details: ", end="\n")
for source in result.source_nodes:
    retrieved_node = source.node
    print(retrieved_node.metadata)
    print("Sample text: ", retrieved_node.text[:100])
    print("")

Response:  Multi-head attention in Transformer is a mechanism that allows the model to jointly attend to information from different representation subspaces at different positions. This is achieved by dividing the attention process into multiple parallel heads, each with a reduced dimension compared to the full dimension.

In the Transformer, there are h = 8 parallel attention layers, or heads. For each head, the queries (Q), keys (K), and values (V) are projected through separate parameter matrices (WQi, WKi, WVi) before being used in the attention calculation. The outputs of all heads are then concatenated and linearly transformed by another parameter matrix (WO) to produce the final output.

The benefit of using multi-head attention is that it allows the model to capture information from different perspectives simultaneously, reducing the risk of averaging inhibiting this as with a single attention head. By using multiple heads, the model can learn to attend to different aspects of 

In [31]:
# Ask about few-shot learning in GPT-3, then print the answer and its source nodes
result = query_engine.query("What is few-shot learning and how does GPT 3 implement it during inference ?")

print("Response: ", result.response, end="\n\n")
print("Source details: ", end="\n")
for source in result.source_nodes:
    retrieved_node = source.node
    print(retrieved_node.metadata)
    print("Sample text: ", retrieved_node.text[:100])
    print("")

Response:  Few-shot learning is a concept in machine learning where a model is trained on a small number of examples (few shots) and is then expected to generalize and perform well on new, unseen examples that are similar to the training data.

Regarding GPT-3, the context information does not provide specific details about its implementation of few-shot learning during inference. However, it is known that GPT-3 is a transformer-based model that uses a large-scale pretraining approach on a diverse range of text data. During inference, it can generate responses based on the input it receives, and it can leverage the knowledge it has learned during pretraining to perform well on new, unseen tasks with only a few examples (few-shot learning). This is due to the model's ability to capture patterns and relationships in the data, allowing it to generalize to new tasks.

Source details: 
{'page_label': '7', 'file_name': 'transformers.pdf', 'file_path': 'C:\\Users\\dhinesh.m\\Documents\\Practi