# **Hybrid Search RAG** using LangChain and OpenAI

In [1]:
!pip install pypdf -q
!pip install langchain -q
!pip install langchain_community -q
!pip install langchain_openai -q
!pip install langchain_chroma -q
!pip install rank_bm25 -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.4/404.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import necessary libraries
import os
from google.colab import userdata

### Initialize OpenAI LLM

In [3]:
from langchain_openai import ChatOpenAI

# Copy the OPENAI_API_KEY value returned by Colab's `userdata` into the
# process environment.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# Chat model used to generate the answers; temperature is fixed at 0.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

### Initialize Embedding Model

In [4]:
from langchain_openai import OpenAIEmbeddings

# Embedding model that is later handed to the vector store.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

### Load PDF Document

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF in the Colab runtime.
pdf_path = "/content/codeprolk.pdf"

# Read the PDF into a list of documents.
loader = PyPDFLoader(pdf_path)
docs = loader.load()

### Split Documents into Chunks

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: chunk_size=250 with chunk_overlap=30.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=30,
)

# Break the loaded documents into chunks for indexing.
chunks = splitter.split_documents(docs)

In [7]:
# Show how many chunks the splitter produced.
len(chunks)

33

### Create Semantic Search Retriever

In [8]:
from langchain_chroma import Chroma

# Index the chunks in a Chroma vector store using the embedding model.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model)

# Semantic-search retriever configured with k=2.
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs=dict(k=2))

In [9]:
# Display the semantic retriever's repr to check its configuration.
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7a3223e8d420>, search_kwargs={'k': 2})

### Create Keyword Search Retriever

In [10]:
# BM25Retriever is implemented in langchain_community; importing it from
# `langchain.retrievers` only works through a deprecated re-export.
from langchain_community.retrievers import BM25Retriever

# Build a BM25 (keyword-based) retriever over the document chunks.
keyword_retriever = BM25Retriever.from_documents(chunks)

# Limit the retriever to 2 results per query.
keyword_retriever.k = 2

In [11]:
# Display the keyword retriever's repr to check its configuration.
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7a321fb8a800>, k=2)

### Create Hybrid Search Retriever

In [12]:
from langchain.retrievers import EnsembleRetriever

# Hybrid retriever: combines the semantic and keyword retrievers,
# weighting each of them equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.5, 0.5],
)

In [13]:
# Display the hybrid retriever's repr to check its members and weights.
ensemble_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7a3223e8d420>, search_kwargs={'k': 2}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7a321fb8a800>, k=2)], weights=[0.5, 0.5])

### Define Prompt Template

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt text sent to the model; {question} and {context} are template
# placeholders filled in when the prompt is formatted.
message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

# Wrap the text as a single human message in a chat prompt template.
prompt = ChatPromptTemplate.from_messages(
    messages=[("human", message)],
)

### Create RAG Chain with Hybrid Search

In [15]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: the incoming question is sent to the hybrid retriever and also
# passed through unchanged; both values fill the prompt, which is then sent
# to the LLM.
# The retrieved documents are piped through `format_docs` so that only their
# text is inserted into the prompt, rather than the string form of a list of
# Document objects.
chain = (
    {
        "context": ensemble_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

### Invoke RAG Chain with Example Questions

In [16]:
# Ask the RAG chain a sample question and print the text of its answer.
question = "what are the popular videos in codeprolk"
response = chain.invoke(question)

print(response.content)

The popular videos in CodePRO LK are those that have assisted learners in their learning journeys and have played a significant role in democratizing tech.


In [None]:
# The following cells compare what each retriever returns for the same query:
# keyword_retriever, vectorstore_retreiver, ensemble_retriever

In [17]:
# Print each chunk returned by the keyword (BM25) retriever alone,
# followed by a separator line.
query = "what are the popular videos in codeprolk"
for doc in keyword_retriever.invoke(query):
    print(doc.page_content, "---------------------", sep="\n")

appreciation and sharing how the videos have assisted them in their learning journ eys. 
Impact  
The CodePRO LK YouTube channel has played a significant role in democratizing tech
---------------------
industry, ensuring that learners are well -prepared for real -world challenges.  
Enhanced Learning Tools  
The platform plans to integrate more interactive and adaptive learning tools to personalize the
---------------------


In [18]:
# Print each chunk returned by the semantic (vector store) retriever alone,
# followed by a separator line.
query = "what are the popular videos in codeprolk"
for doc in vectorstore_retreiver.invoke(query):
    print(doc.page_content, "---------------------", sep="\n")

appreciation and sharing how the videos have assisted them in their learning journ eys. 
Impact  
The CodePRO LK YouTube channel has played a significant role in democratizing tech
---------------------
support each other. Additionally, the platform offers consultation services for personalized 
learning support.  
 
CodePRO LK YouTube Channel  
Overview  
The CodePRO LK YouTube Channel  is a crucial extension of the platform, providing a wealth
---------------------


In [19]:
# Print each chunk returned by the hybrid (ensemble) retriever,
# followed by a separator line.
query = "what are the popular videos in codeprolk"
for doc in ensemble_retriever.invoke(query):
    print(doc.page_content, "---------------------", sep="\n")

appreciation and sharing how the videos have assisted them in their learning journ eys. 
Impact  
The CodePRO LK YouTube channel has played a significant role in democratizing tech
---------------------
support each other. Additionally, the platform offers consultation services for personalized 
learning support.  
 
CodePRO LK YouTube Channel  
Overview  
The CodePRO LK YouTube Channel  is a crucial extension of the platform, providing a wealth
---------------------
industry, ensuring that learners are well -prepared for real -world challenges.  
Enhanced Learning Tools  
The platform plans to integrate more interactive and adaptive learning tools to personalize the
---------------------
