<a href="https://colab.research.google.com/github/geijinchan/Better-RAG-with-Merge-Retriever-LOTR-/blob/main/Better%20RAG%20(LOTR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Important Dependencies

In [4]:
!pip install chromadb langchain transformers sentence-transformers
!pip install -U langchain-community
!pip install pypdf



## Importing necessary dependencies

In [5]:
import chromadb, torch, os
from langchain.document_transformers import (
    LongContextReorder,
)
from langchain.chains import StuffDocumentsChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Chroma

## Getting the embedding model

In [7]:
model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embedding': False}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [8]:
print(f"Embedding model loaded \n{hf}")

Embedding model loaded 
client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='BAAI/bge-large-en' cache_folder=None model_kwargs={'device': 'cpu'} encode_kwargs={'normalize_embedding': False} query_instruction='Represent this question for searching relevant passages: ' embed_instruction='' show_progress=False


In [11]:
## Data Preprocessing

loader_un_sdg = PyPDFLoader("UN SDG.pdf")
documents_un_sdg = loader_un_sdg.load()
text_splitter_un_sdg = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts_un_sdg = text_splitter_un_sdg.split_documents(documents_un_sdg)

In [12]:
## Data Preprocessing

loader_paris_agreement = PyPDFLoader("english_paris_agreement.pdf")
documents_paris_agreement = loader_paris_agreement.load()
text_splitter_paris_agreement = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
texts_paris_agreement = text_splitter_paris_agreement.split_documents(documents_paris_agreement)

## Create and Store Vectors

In [13]:
un_sdg_store = Chroma.from_documents(texts_un_sdg, hf, collection_metadata={"hnsw:space": "cosine"}, persist_directory="store/un_sdg_chroma_cosine")
paris_agreement_store = Chroma.from_documents(texts_paris_agreement, hf, collection_metadata={"hnsw:space": "cosine"}, persist_directory="store/paris_chroma_cosine")

## Load the vector store

In [14]:
load_un_store = Chroma(persist_directory = "store/un_sdg_chroma_cosine",embedding_function=hf)
load_paris_store = Chroma(persist_directory = "store/paris_chroma_cosine",embedding_function=hf)

  load_un_store = Chroma(persist_directory = "store/un_sdg_chroma_cosine",embedding_function=hf)


## Init Mege Retriever and Perform Semantic Search

In [15]:
retriever_un_sdg = load_un_store.as_retriever(search_type = "similarity", search_kwargs = {"k":3})
retriever_paris_agreement = load_paris_store.as_retriever(search_type = "similarity", search_kwargs = {"k":3})

In [18]:
lotr = MergerRetriever(retrievers=[retriever_un_sdg, retriever_paris_agreement])
query = "What does the goal 16 talks about"
docs = lotr.get_relevant_documents(query)
docs

  docs = lotr.get_relevant_documents(query)


[Document(metadata={'page': 29, 'source': 'UN SDG.pdf'}, page_content='!!Goal!16.!Promote!peaceful!and!inclusive!societies!for!sustainable!development,!provide!access!to!justice!for!all!and!build!effective,!accountable!and!inclusive!institutions!at!all!levels!+16.1+Significantly+reduce+all+forms+of+violence+and+related+death+rates+everywhere++16.2+End+ abuse,+ exploitation,+ trafficking+ and+ all+ forms+ of+ violence+ against+ and+ torture+ of+children++16.3+Promote+the+rule+of+law+at+the+national+and+international+levels+and+ensure+equal+access+to+justice+for+all++16.4+By+2030,+significantly+reduce+illicit+financial+and+arms+flows,+strengthen+the+recovery+and+return+of+stolen+assets+and+combat+all+forms+of+organized+crime++16.5+Substantially+reduce+corruption+and+bribery+in+all+their+forms++16.6+Develop+effective,+accountable+and+transparent+institutions+at+all+levels++16.7+Ensure+ responsive,+ inclusive,+ participatory+ and+ representative+ decisionJmaking+ at+'),
 Document(metadata=

In [16]:
## Long Context Reorder (Lost in the Middlee Problem Fix)

https://arxiv.org/abs/2307.03172

In [19]:
recording = LongContextReorder()

reordered_docs = recording.transform_documents(docs)

reordered_docs

[Document(metadata={'page': 7, 'source': 'english_paris_agreement.pdf'}, page_content='16. Parties, including regional economic integration organizations and their \nmember States, that have reached an agreement to act jointly under paragraph 2 of \nthis Article shall notify the secretariat of the terms of that agreement, including the \nemission level allocated to each Party within the relevant time period, when they \ncommunicate their nationally determined contributions. The secretariat shall in turn \ninform the Parties and signatories to the Convention of the terms of that agreement. \n17. Each party to such an agreement shall be responsible for its emission level as \nset out in the agreement referred to in paragraph 16 of this Article in accordance \nwith paragraphs 13 and 14 of this Article and Articles 13 and 15. \n18. If Parties acting jointly do so in the framework of, and together with, a \nregional economic integration organization which is itself a Party to this \nAgreeme