In [7]:
#Install required packages
!pip install langchain numpy python-dotenv rank-bm25
!pip install pypdf
!pip install PyMuPDF
!pip install python-dotenv
!pip install langchain-community
!pip install sentence-transformers
!pip install langchain-google-genai
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [1]:
#Clone the repository to access helper functions and evaluation modules
!git clone https://github.com/databytobi/RAG_TECHNIQUES.git
import sys
sys.path.append('RAG_TECHNIQUES')
# If you need to run with the latest data
# !cp -r RAG_TECHNIQUES/data .

Cloning into 'RAG_TECHNIQUES'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 68 (delta 19), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (68/68), 10.02 MiB | 15.98 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [2]:
import os
import sys
from dotenv import load_dotenv
from langchain.docstore.document import Document

from typing import List
from rank_bm25 import BM25Okapi
import numpy as np

# Load environment variables from a .env file
load_dotenv()

# Set the OpenAI API key environment variable (comment out if not using GOOGLE)
if not os.getenv('GOOGLE_API_KEY'):
    os.environ["GOOGLE_API_KEY"] = input("Please enter your GOOGLE API key: ")
else:
    os.environ["GOOGLE_API_KEY"] = os.getenv('GOOGLE_API_KEY')

# Original path append replaced for Colab compatibility
from helper_functions import *
#from evaluation.evalute_rag import *


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from helper_functions import *


Define document path

In [3]:
#Download required data files
import os
os.makedirs('data', exist_ok=True)

# Download the PDF document used in this notebook
!wget -O data/Agents_v8.pdf https://raw.githubusercontent.com/databytobi/RAG_TECHNIQUES/main/data/Agents_v8.pdf
!wget -O data/Agents_v8.pdf https://raw.githubusercontent.com/databytobi/RAG_TECHNIQUES/main/data/Agents_v8.pdf

--2025-08-05 12:55:00--  https://raw.githubusercontent.com/databytobi/RAG_TECHNIQUES/main/data/Agents_v8.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9305713 (8.9M) [application/octet-stream]
Saving to: ‘data/Agents_v8.pdf’


2025-08-05 12:55:00 (88.6 MB/s) - ‘data/Agents_v8.pdf’ saved [9305713/9305713]

--2025-08-05 12:55:00--  https://raw.githubusercontent.com/databytobi/RAG_TECHNIQUES/main/data/Agents_v8.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9305713 (8.9M) [application/octet-stream]
Saving to: ‘data/Agents_

In [4]:
path = "data/Agents_v8.pdf"

Encode the pdf to vector store and return split document from the step before, to create BM25 instance

In [5]:
def encode_pdf_and_get_split_documents(path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes a PDF book into a vector store using OpenAI embeddings.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk.
        chunk_overlap: The amount of overlap between consecutive chunks.

    Returns:
        A FAISS vector store containing the encoded book content.
    """

    # Load PDF documents
    loader = PyPDFLoader(path)
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    cleaned_texts = replace_t_with_space(texts)

    # Create embeddings and vector store
    embeddings = HuggingFaceEmbeddings()
    vectorstore = FAISS.from_documents(cleaned_texts, embeddings)

    return vectorstore, cleaned_texts

Create vectorstore and get the chunked documents

In [8]:
vectorstore, cleaned_texts = encode_pdf_and_get_split_documents(path)

  embeddings = HuggingFaceEmbeddings()


Create a bm25 index for retrieving documents by keywords

In [9]:
def create_bm25_index(documents: List[Document]) -> BM25Okapi:
    """
    Create a BM25 index from the given documents.

    BM25 (Best Matching 25) is a ranking function used in information retrieval.
    It's based on the probabilistic retrieval framework and is an improvement over TF-IDF.

    Args:
    documents (List[Document]): List of documents to index.

    Returns:
    BM25Okapi: An index that can be used for BM25 scoring.
    """
    # Tokenize each document by splitting on whitespace
    # This is a simple approach and could be improved with more sophisticated tokenization
    tokenized_docs = [doc.page_content.split() for doc in documents]
    return BM25Okapi(tokenized_docs)

In [10]:
bm25 = create_bm25_index(cleaned_texts) # Create BM25 index from the cleaned texts (chunks)

Define a function that retrieves both semantically and by keyword, normalizes the scores and gets the top k documents

In [11]:
def fusion_retrieval(vectorstore, bm25, query: str, k: int = 5, alpha: float = 0.5) -> List[Document]:
    """
    Perform fusion retrieval combining keyword-based (BM25) and vector-based search.

    Args:
    vectorstore (VectorStore): The vectorstore containing the documents.
    bm25 (BM25Okapi): Pre-computed BM25 index.
    query (str): The query string.
    k (int): The number of documents to retrieve.
    alpha (float): The weight for vector search scores (1-alpha will be the weight for BM25 scores).

    Returns:
    List[Document]: The top k documents based on the combined scores.
    """

    epsilon = 1e-8

    # Step 1: Get all documents from the vectorstore
    all_docs = vectorstore.similarity_search("", k=vectorstore.index.ntotal)

    # Step 2: Perform BM25 search
    bm25_scores = bm25.get_scores(query.split())

    # Step 3: Perform vector search
    vector_results = vectorstore.similarity_search_with_score(query, k=len(all_docs))

    # Step 4: Normalize scores
    vector_scores = np.array([score for _, score in vector_results])
    vector_scores = 1 - (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon)

    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) -  np.min(bm25_scores) + epsilon)

    # Step 5: Combine scores
    combined_scores = alpha * vector_scores + (1 - alpha) * bm25_scores

    # Step 6: Rank documents
    sorted_indices = np.argsort(combined_scores)[::-1]

    # Step 7: Return top k documents
    return [all_docs[i] for i in sorted_indices[:k]]

Use case

In [12]:
#Query
query = "What an agent?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)
docs_content = [doc.page_content for doc in top_docs]
show_context(docs_content)

Context 1:
Agents
37
February 2025
Who did the Texas Longhorns play in football last week? What is the address 
of the other team's stadium?
Tool Calls: search
Args:
 query: Texas Longhorns football schedule
Name: search
{...Results: "NCAA Division I Football, Georgia, Date..."}
The Texas Longhorns played the Georgia Bulldogs last week.
Tool Calls: places
Args:
 query: Georgia Bulldogs stadium
Name: places
{...Sanford Stadium Address: 100 Sanford...}
The address of the Georgia Bulldogs stadium is 100 Sanford Dr, Athens, GA


Context 2:
Agents
2
February 2025
Acknowledgements
Content contributors
Evan Huang
Emily Xue
Olcan Sercinoglu
Sebastian Riedel
Satinder Baveja
Antonio Gulli
Anant Nawalgaria
Curators and Editors
Antonio Gulli
Anant Nawalgaria
Grace Mollison 
Technical Writer
Joey Haymaker
Designer
Michael Lanning


Context 3:
Agents
36
February 2025
Python
from langgraph.prebuilt import create_react_agent
from langchain_core.tools import tool
from langchain_community.utilities impo

In [13]:
#Query
query = "usin Cognitive architectures: can you explain How agents operate ?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)
docs_content = [doc.page_content for doc in top_docs]
show_context(docs_content)

Context 1:
Agents
37
February 2025
Who did the Texas Longhorns play in football last week? What is the address 
of the other team's stadium?
Tool Calls: search
Args:
 query: Texas Longhorns football schedule
Name: search
{...Results: "NCAA Division I Football, Georgia, Date..."}
The Texas Longhorns played the Georgia Bulldogs last week.
Tool Calls: places
Args:
 query: Georgia Bulldogs stadium
Name: places
{...Sanford Stadium Address: 100 Sanford...}
The address of the Georgia Bulldogs stadium is 100 Sanford Dr, Athens, GA


Context 2:
Agents
9
February 2025
• They gather information, like the patron’s order and what ingredients are in the pantry 
and refrigerator.
• They perform some internal reasoning about what dishes and flavor profiles they can 
create based on the information they have just gathered.
• They take action to create the dish: chopping vegetables, blending spices, searing meat.
At each stage in the process the chef makes adjustments as needed, refining their plan as 


In [14]:



#Query
query = "can you explain how to Enhancing model performance with targeted learning?"


# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)
docs_content = [doc.page_content for doc in top_docs]
show_context(docs_content)

Context 1:
Agents
37
February 2025
Who did the Texas Longhorns play in football last week? What is the address 
of the other team's stadium?
Tool Calls: search
Args:
 query: Texas Longhorns football schedule
Name: search
{...Results: "NCAA Division I Football, Georgia, Date..."}
The Texas Longhorns played the Georgia Bulldogs last week.
Tool Calls: places
Args:
 query: Georgia Bulldogs stadium
Name: places
{...Sanford Stadium Address: 100 Sanford...}
The address of the Georgia Bulldogs stadium is 100 Sanford Dr, Athens, GA


Context 2:
using a matching algorithm like SCaNN
3. The matched content is retrieved from the vector database in text format and sent back to 
the agent
4. The agent receives both the user query and retrieved content, then formulates a response 
or action


Context 3:
Agents
22
February 2025
function_call {
  name: "display_cities"
  args: {
    "cities": ["Crested Butte", "Whistler", "Zermatt"],
    "preferences": "skiing"
    }
}
Snippet 5. Sample Function Call p