<a href="https://colab.research.google.com/github/edcalderin/llm-ml-experiments/blob/master/the_role_of_hybrid_search_in_rag_applications.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Role of Hybrid Search in RAG Applications

https://ai.plainenglish.io/the-role-of-hybrid-search-in-rag-applications-29bf46b95152

Reminder theory

* Hybrid search: Semantic and keyword search.
* Vector search represents documents as dense embeddings, whereas keyword search sparse vector embeddings.
* Keyword search is implemented with BM25 algorithm and relies on lexical matching.

## Modules

In [14]:
!pip install -qU langchain-openai \
                 langchain-community \
                 pypdf \
                 faiss-cpu \
                 rank_bm25

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

## Loading documents and splitting

In [15]:
from langchain import hub
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [16]:
pdf_url: str = "https://www.cgonzalez.org/app/download/24268786/Challenger+Esp.pdf"

loader = PyPDFLoader(pdf_url)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 150,
    chunk_overlap = 10
)

chunks = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()

llm = ChatOpenAI(model="gpt-4o")

## Retrievers

In [17]:
faiss_vectorstore = FAISS.from_documents(chunks, embeddings)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 3})

bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 3

ensembre_retriever = EnsembleRetriever(retrievers = [faiss_retriever, bm25_retriever], weights=[.5, .5])

In [34]:
query = "En que año se lanzo el transbordador?"

retriever_dict = {
    "FAISS Retriever": faiss_retriever.invoke(query),
    "BM25 Retriever": bm25_retriever.invoke(query),
    "Hybrid search": ensembre_retriever.invoke(query)
}

for name, docs in retriever_dict.items():
    print(name)
    print([doc.page_content.replace("/n", " ") for doc in docs], sep="\n\n")
    print("="*50)

FAISS Retriever
['Era el 12 de Abril de 1981. N ASA iba a iniciar una nueva era en el transporte espacial', 'Este primer vuelo estaba designado como “vuelo de prueba”, usaba el transbordador \nColumbia y se denominó STS-1.', 'Jarvis y la primera tripulante civil en un vuelo de NASA S. Christa McAuliffe. \nLanzamiento del Challenger (STS 51-L)']
BM25 Retriever
['experimentó un fallo estructural catastrófico que determinó la pérdida de la tripulación \ny la nave.  \nEn Madrid , ajenos a lo que había', 'En Madrid nos quedó la sensación de haber \nperdido a gente de los nuestros pues así \nconsiderábamos a los astronautas que', 'fallecer con el impacto en el océano que se calcula fue a 333 km/h.']
Hybrid search
['Era el 12 de Abril de 1981. N ASA iba a iniciar una nueva era en el transporte espacial', 'experimentó un fallo estructural catastrófico que determinó la pérdida de la tripulación \ny la nave.  \nEn Madrid , ajenos a lo que había', 'Este primer vuelo estaba designado como “vuelo d

## Chains for final response

In [46]:
prompt = hub.pull("rlm/rag-prompt")

def chain_builder(context):
    return (
        {
            "question": RunnablePassthrough(), "context": context
        }
        | prompt
        | llm
        | StrOutputParser()
    )

chain_dict = {
    "FAISS Chain": chain_builder(faiss_retriever),
    "BM25 Chain": chain_builder(bm25_retriever),
    "Hybrid Chain": chain_builder(ensembre_retriever)
}

for name, chain in chain_dict.items():
    print(name, chain.invoke(query), end=f"\n{'='*50}\n\n", sep="\n")



FAISS Chain
El transbordador fue lanzado por primera vez el 12 de abril de 1981.

BM25 Chain
No puedo determinar el año de lanzamiento del transbordador con la información proporcionada.

Hybrid Chain
El transbordador fue lanzado el 12 de abril de 1981.



## Using Pinecone

### Setup

In [48]:
os.environ["PINECONE_API_KEY"] = userdata.get('PINECONE_API_KEY')

In [69]:
!pip install -q langchain-pinecone pinecone pinecone-text

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 

In [87]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_core.documents import Document
from pinecone_text.hybrid import hybrid_convex_scale

In [65]:
index_name: str = "hybrid-search-test"

pc = Pinecone()

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

In [74]:
bm25_encoder = BM25Encoder().default()

texts = [doc.page_content for doc in chunks]

bm25_encoder.fit(texts)

# store the values to a json file
bm25_encoder.dump("bm25_values.json")

# load to your BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values.json")

retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index
)

  0%|          | 0/86 [00:00<?, ?it/s]

In [80]:
def get_relevant_documents(query, index):
    sparse_vec = bm25_encoder.encode_queries(query)
    dense_vec = embeddings.embed_query(query)
    #len(dense_vec), len(sparse_vec)
    dense_vec, sparse_vec = hybrid_convex_scale(dense_vec, sparse_vec, 0.4)
    sparse_vec["values"] = [float(s1) for s1 in sparse_vec["values"]]
    result=index.query(vector=dense_vec, sparse_vec=sparse_vec, top_k=4, include_metadata=True)
    final_result = []
    for res in result["matches"]:
        context = res["metadata"].pop("text")
        final_result.append(
            Document(page_content=context, metadata=res["metadata"])
        )
    return final_result


final_result = get_relevant_documents(query, index)

print(final_result)

[Document(metadata={'author': 'cgonzalez', 'comments': '', 'company': 'MDSCC', 'creationdate': '2016-03-06T12:45:34+01:00', 'creator': 'Acrobat PDFMaker 9.1 for Word', 'keywords': '', 'moddate': '2016-03-06T12:46:25+01:00', 'page': 0.0, 'page_label': '1', 'producer': 'Adobe PDF Library 9.0', 'source': 'https://www.cgonzalez.org/app/download/24268786/Challenger+Esp.pdf', 'sourcemodified': 'D:20160306114458', 'subject': '', 'title': '', 'total_pages': 5.0}, page_content='Era el 12 de Abril de 1981. N ASA iba a iniciar una nueva era en el transporte espacial'), Document(metadata={'author': 'cgonzalez', 'comments': '', 'company': 'MDSCC', 'creationdate': '2016-03-06T12:45:34+01:00', 'creator': 'Acrobat PDFMaker 9.1 for Word', 'keywords': '', 'moddate': '2016-03-06T12:46:25+01:00', 'page': 0.0, 'page_label': '1', 'producer': 'Adobe PDF Library 9.0', 'source': 'https://www.cgonzalez.org/app/download/24268786/Challenger+Esp.pdf', 'sourcemodified': 'D:20160306114458', 'subject': '', 'title': '

In [86]:
chain_builder(lambda _: final_result).invoke(query)

'El transbordador espacial fue lanzado el 12 de abril de 1981.'