# Retrievers

In [None]:
!pip install langchain langchain-community langchain_openai pdfplumber chromadb

In [None]:
!pip install pypdf

In [None]:
!pip install --quiet langchain-chroma

In [38]:
# Install/upgrade rank_bm25 quietly; whatever pip still writes to stdout is sent to /dev/null.
!pip install --upgrade --quiet  rank_bm25 > /dev/null

In [None]:
!pip install faiss-cpu

In [10]:
import re
import math
import numpy as np
from collections import Counter, defaultdict
from langchain_community.retrievers import BM25Retriever
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document

In [4]:
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_openai import ChatOpenAI
from langchain.vectorstores import Chroma
from google.colab import userdata
import pdfplumber
import uuid
import os

In [17]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [39]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [5]:
# Read the OpenAI key from the Colab secret store and publish it through the
# OPENAI_API_KEY environment variable.
GIRUOPENAIKEY = userdata.get('GIRUOPENAIKEY')
os.environ.update(OPENAI_API_KEY=GIRUOPENAIKEY)

In [7]:
# The API key is read internally from the environment; if it is not set there,
# pass it to the constructor as an argument instead.
model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

## Loading Data

In [13]:
# One PDF loader per source file.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in (
        "/content/data/Understanding_Climate_Change.pdf",
        "/content/data/What is finance.pdf",
    )
]

# Flatten everything the loaders return into a single list of documents.
docs = [loaded_doc for loader in loaders for loaded_doc in loader.load()]

# Quick sanity check: number of loaded documents and container type.
(len(docs), type(docs))

(49, list)

## Multi Vector Retriever  

There are two major types:
- Parent-Child Retriever  
The search is performed over the child chunks, and the respective parent document is fetched to provide a larger context.   
- Summary-Based Retriever   
The summary is matched first, and then the respective detailed, smaller documents are fetched on the basis of matching ids. It is used for fast searching with a small context, or for accurate/exact matching.

Complete | Large Documents

In [14]:
# Parent ("large") documents: split the loaded documents with a 10,000-character chunk size.
# Note that `docs` is rebound to the split result.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(documents=docs)
(len(docs), type(docs))

(49, list)

In [22]:
# List the attribute names stored on the first document.
first_parent = docs[0]
vars(first_parent).keys()

dict_keys(['id', 'metadata', 'page_content', 'type'])

In [28]:
# Preview the first 200 characters of the first document's text.
first_text = docs[0].page_content
first_text[:200]

'Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the plane'

In [18]:
import uuid

# Metadata key under which each child chunk records the id of its parent.
id_key = "doc_id"

# Vector store that indexes the child chunks.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="full_documents",
)

# Storage layer that holds the parent documents.
store = InMemoryByteStore()

# The retriever starts out empty; chunks and parents are added in later cells.
retriever = MultiVectorRetriever(
    id_key=id_key,
    byte_store=store,
    vectorstore=vectorstore,
)

# One random UUID string per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [29]:
# Sanity check: one id per parent document.
(len(doc_ids), type(doc_ids))

(49, list)

In [30]:
# Splitter that cuts each parent document into smaller child chunks.
CHILD_CHUNK_SIZE = 271
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHILD_CHUNK_SIZE)

In [31]:
# Split every parent into child chunks and tag each chunk with its parent's id
# (stored under `id_key`) so a chunk hit can be traced back to the parent.
sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    child_chunks = child_text_splitter.split_documents([parent_doc])
    for child_chunk in child_chunks:
        child_chunk.metadata[id_key] = parent_id
    sub_docs.extend(child_chunks)

Uploading / Upserting child documents to the vector store

In [None]:
# Embed and index the child chunks. The call's return value (the ids of the
# added chunks) is captured so the cell shows only how many chunks were added
# rather than echoing every id.
child_chunk_ids = retriever.vectorstore.add_documents(sub_docs)
len(child_chunk_ids)

Uploading large documents into doc store

In [33]:
# Store each parent document in the docstore under its generated id.
parent_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(parent_pairs)

If we perform a similarity search directly on the vector store, it returns the smaller (child) chunks.

In [34]:
# Querying the vector store directly yields the small child chunks.
child_hits = retriever.vectorstore.similarity_search("Finance")
child_hits[0]

Document(id='ba487ebd-8cad-4db2-9ce8-c68a69ee83ca', metadata={'doc_id': 'c093ede9-0a7e-4854-adce-e41283b65349', 'page': 1, 'source': '/content/data/What is finance.pdf'}, page_content='What is Finance?\n• "Finance" is a broad term that describes \ntwo related activities: the study of how \nmoney is managed and the actual process \nof acquiring needed funds. \n• Because individuals, businesses and \ngovernment entities all need funding to')

In [36]:
# Going through the retriever yields the larger parent document instead;
# show the length of the top result's text.
top_parent = retriever.invoke("Finance")[0]
len(top_parent.page_content)

395

In [37]:
# First 500 characters of the top parent document returned for the query.
parent_text = retriever.invoke("Finance")[0].page_content
parent_text[:500]

'What is Finance?\n• "Finance" is a broad term that describes \ntwo related activities: the study of how \nmoney is managed and the actual process \nof acquiring needed funds. \n• Because individuals, businesses and \ngovernment entities all need funding to \noperate, the field is often separated into \nthree sub-categories: personal finance, \ncorporate finance and public finance.\nwww.investopedia.com'

## Ensemble Retriever

In [40]:
# Reload the source PDFs for the ensemble section.
pdf_paths = (
    "/content/data/Understanding_Climate_Change.pdf",
    "/content/data/What is finance.pdf",
)
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Accumulate the documents from every loader into one list.
docs = []
for loader in loaders:
    docs += loader.load()

(len(docs), type(docs))

(49, list)

In [48]:
# First retriever: BM25 over the loaded documents, returning the top 2 matches.
bm25_retriever = BM25Retriever.from_documents(docs, k=2)

In [49]:
# Run the query through BM25; the query text is kept exactly as typed.
query = "What is Cimate?"
res = bm25_retriever.invoke(query)
len(res)

2

In [50]:
# First 500 characters of the top BM25 hit.
top_bm25_hit = res[0]
top_bm25_hit.page_content[:500]

'What is Finance?\n• "Finance" is a broad term that describes \ntwo related activities: the study of how \nmoney is managed and the actual process \nof acquiring needed funds. \n• Because individuals, businesses and \ngovernment entities all need funding to \noperate, the field is often separated into \nthree sub-categories: personal finance, \ncorporate finance and public finance.\nwww.investopedia.com'

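*Successfully failed! BM25 is a keyword matcher, and the query's misspelled "Cimate" does not match the word "climate", so it effectively ignored the climate term and returned documents for 'What is'.*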

In [54]:
# Second retriever: FAISS index over OpenAI embeddings, returning the top 2 matches.
embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_documents(documents=docs, embedding=embedding)
faiss_search_kwargs = {"k": 2}
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=faiss_search_kwargs)

In [55]:
# Same (misspelled) query, this time through the FAISS retriever.
semantic_hits = faiss_retriever.invoke("What is Cimate?")
semantic_hits

[Document(id='f2c2bd2b-9270-43a0-b196-114740c76192', metadata={'source': '/content/data/Understanding_Climate_Change.pdf', 'page': 0}, page_content='Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. \nHistorical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n11,700 years ago marking the beginning of the modern climate era and human civilization. \nMost of these climate changes are attributed to very small variations i

*Because FAISS matches on semantic meaning, it gives the correct (relevant) output.*

In [56]:
# Ensemble weighted towards FAISS: BM25 0.3, FAISS 0.7.
ensemble_retriever_1 = EnsembleRetriever(
    weights=[0.3, 0.7],
    retrievers=[bm25_retriever, faiss_retriever],
)

In [57]:
# Ensemble with equal weights: BM25 0.5, FAISS 0.5.
ensemble_retriever_2 = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, faiss_retriever],
)

In [58]:
# Ensemble weighted towards BM25: BM25 0.8, FAISS 0.2.
ensemble_retriever_3 = EnsembleRetriever(
    weights=[0.8, 0.2],
    retrievers=[bm25_retriever, faiss_retriever],
)

In [62]:
# Send the same query through each weighting and print the first 300
# characters of the top-ranked document for comparison.
ensemble_query = "What is Cimate?"
labelled_ensembles = (
    ("First", ensemble_retriever_1),
    ("Second", ensemble_retriever_2),
    ("Third", ensemble_retriever_3),
)
for label, ensemble in labelled_ensembles:
    preview = ensemble.invoke(ensemble_query)[0].page_content[:300]
    print(f'{label} \n {preview} \n')

First 
 Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an exte 

Second 
 What is Finance?
• "Finance" is a broad term that describes 
two related activities: the study of how 
money is managed and the actual process 
of acquiring needed funds. 
• Because individuals, businesses and 
government entities all need funding to 
operate, the field is often separated into 
thre 

Third 
 What is Finance?
• "Finance" is a broad term that describes 
two related activities: the study of how 
money is managed and the actual process 
of acquiring needed funds. 
• Because individuals, businesses and 
government entities all need funding to 
operate, the field is often separated into 
thre 

