In [None]:
# Prompt -> LLM -> Response
# (New data/external data) + Prompt -> LLM -> Response
# How are we retrieving this information to add it to the prompt
# Copy paste new information
# Can programmatically read the new information
# Can scrape the information

### CONTEXT WINDOW ###
# chunk -> embeddings -> DB -> retrieve the appropriate information and add it to the prompt
# Appropriate -> data -> embeddings (models) -> DB -> perform query (similarity) -> get relevant content data
# appropriate + prompt -> LLM -> Response

# Document Loaders

## CSV Loader

In [None]:
import os

In [None]:
from langchain.document_loaders import CSVLoader

In [None]:
os.path.isdir("../datasets/sns_datasets")

In [None]:
loader = CSVLoader(file_path="../datasets/sns_datasets/titanic.csv")

In [None]:
data = loader.load()

In [None]:
type(data[0])

In [None]:
data[0].page_content

In [None]:
data[0].metadata

In [None]:
print(data[0].page_content)

In [None]:
loader = CSVLoader(file_path="../datasets/sns_datasets/titanic.csv", source_column='sex')

In [None]:
loader.load()

## HTML Loader

In [None]:
from langchain.document_loaders import UnstructuredHTMLLoader

In [None]:
loader = UnstructuredHTMLLoader(file_path="../datasets/harry_potter_html/001.htm")

In [None]:
data = loader.load()

data

In [None]:
len(data)

In [None]:
print(data[0].page_content)

In [None]:
from langchain.document_loaders import BSHTMLLoader

In [None]:
loader = BSHTMLLoader(file_path="../datasets/harry_potter_html/001.htm")

data = loader.load()

In [None]:
len(data)

In [None]:
print(data[0].page_content)

## JSON Loader

In [None]:
import json

# Location of the raw population JSON used by the loader examples below.
json_filepath = "../datasets/population_data.json"

# Parse directly from the open file handle.
with open(json_filepath) as json_file:
    loaded_json = json.load(json_file)

len(loaded_json)

In [None]:
loaded_json

In [None]:
from langchain.document_loaders import JSONLoader

In [None]:
# NOTE(review): jq expressions normally select a field with a leading dot
# (e.g. ".Value"); the bare name "Value" looks like it may be rejected by jq.
# Confirm against the structure of population_data.json before changing.
loader = JSONLoader(file_path=json_filepath, jq_schema="Value")

# Run the loader; `data` holds whatever the loader returned.
data = loader.load()

data

## Markdown Loader

In [None]:
from langchain.document_loaders import UnstructuredMarkdownLoader

In [None]:
md_filepath = "../datasets/harry_potter_md/001.md"

os.path.isfile(md_filepath)

In [None]:
loader = UnstructuredMarkdownLoader(file_path=md_filepath)

data = loader.load()

data

In [None]:
len(data)

In [None]:
print(data[0].page_content)

## PDF Loader

In [None]:
from langchain.document_loaders import PyPDFLoader

pdf_filepath = "../datasets/harry_potter_pdf/hpmor-trade-classic.pdf"

loader = PyPDFLoader(file_path=pdf_filepath)

data = loader.load()

data

In [None]:
len(data)

In [None]:
print(data[1].page_content)

In [None]:
data[0].metadata

## Integrations

In [None]:
# Wikipedia

In [None]:
from langchain.document_loaders import WikipediaLoader

In [None]:
loader = WikipediaLoader(query='India', load_max_docs=2)

In [None]:
data = loader.load()

In [None]:
print(data[0].page_content)

In [None]:
data[0].metadata

## ArXiv

In [None]:
from langchain_community.document_loaders import ArxivLoader

In [None]:
loader = ArxivLoader(query='2201.03916', load_max_docs=1)

data = loader.load()

In [None]:
len(data)

In [None]:
print(data[0].page_content)

In [None]:
data[0].metadata

In [None]:
# Loading the chat model

import os
from langchain_openai import ChatOpenAI
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Ensure OPENAI_API_KEY is set in your .env file
if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError("Please set OPENAI_API_KEY in your .env file")

chat = ChatOpenAI()
set_llm_cache(InMemoryCache())

In [None]:
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Role given to the model, and the task template with {title}/{content} slots.
system_template = "You are a Peer Reviewer"
human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
# Only the abstract (metadata['Summary']) is sent here, not the full page_content.
prompt = chat_prompt.format_prompt(title=data[0].metadata['Title'], content=data[0].metadata['Summary'])

# `invoke` replaces the deprecated direct call `chat(messages=...)`.
response = chat.invoke(prompt.to_messages())

print(response.content)

In [None]:
def peer_review(article_id):
    """Ask the chat model for a critical review of an arXiv paper.

    Args:
        article_id: Query handed to ``ArxivLoader`` (the notebook passes arXiv
            identifiers such as ``'1706.03762'``).

    Returns:
        The text content of the model's reply.

    Raises:
        ValueError: If the loader returns no documents for ``article_id``.
    """
    chat = ChatOpenAI()

    # Only the first hit is reviewed, so a single document is requested.
    loader = ArxivLoader(query=article_id, load_max_docs=1)
    data = loader.load()
    if not data:
        raise ValueError(f"No arXiv document found for {article_id!r}")

    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']

    system_template = "You are a Peer Reviewer"
    human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
    # The full paper text (page_content) is what gets reviewed.
    prompt = chat_prompt.format_prompt(title=title, content=page_content)

    # `invoke` replaces the deprecated direct call `chat(messages=...)`.
    response = chat.invoke(prompt.to_messages())

    return response.content

In [None]:
print(peer_review('1706.03762'))

In [None]:
print(peer_review('2201.03514'))

In [None]:
# Create a bot that can answer questions based on wikipedia articles

# Text Splitter

In [None]:
# Path of the text file that is read into `hp_book` below.
filepath = "../datasets/Harry Potter 1 - Sorcerer's Stone.txt"

# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file decodes correctly on every machine this runs on.
with open(filepath, 'r') as f:
    hp_book = f.read()
    
# Prints the entire file contents; the output may be very long.
print(hp_book)

In [None]:
len(hp_book)

In [None]:
len(hp_book.split())

In [None]:
len(hp_book.split("\n"))

In [None]:
len(hp_book.split("\n\n"))

In [None]:
from collections import Counter

In [None]:
# Length (in characters) of every blank-line-separated piece of the book.
line_len_list = [len(paragraph) for paragraph in hp_book.split('\n\n')]

# Frequency table: piece length -> number of pieces with that length.
Counter(line_len_list)

In [None]:
# Character level splitting

from langchain.text_splitter import CharacterTextSplitter

In [None]:
def len_func(text):
    """Return the length of ``text`` as reported by ``len``.

    Passed to the text splitters as their ``length_function``.
    """
    size = len(text)
    return size

In [None]:
(100 + 100 + 900) + 300

In [None]:
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1200,
    chunk_overlap=100,
    length_function=len_func,
    is_separator_regex=False
)

In [None]:
para_list = text_splitter.create_documents(texts=[hp_book])

para_list

In [None]:
# One list entry per blank-line-separated piece of the raw text.
manual_character_split_chunks = list(hp_book.split("\n\n"))

len(manual_character_split_chunks)

In [None]:
# Shallow copy of the splitter output, one entry per chunk.
langchain_character_split_chunks = list(para_list)

len(langchain_character_split_chunks)

In [None]:
first_chunk = para_list[0]

In [None]:
first_chunk.metadata = {"source": filepath}

In [None]:
first_chunk.metadata

In [None]:
# Tag every chunk with its source file and its position in the split order.
for chunk_number, chunk in enumerate(para_list):
    chunk.metadata = {"source": filepath, "chunk_number": chunk_number}

# Same Document objects as para_list, collected into a new list.
res_para_list = list(para_list)

In [None]:
res_para_list[100].metadata

In [None]:
extra_line = " ".join(['word']*500)

len(text_splitter.create_documents(texts = [extra_line + hp_book])[0].page_content)

In [None]:
text_splitter.create_documents(texts = [extra_line + hp_book])[0]

## Recursive Character Splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ' '],
    chunk_size = 200,
    chunk_overlap = 100,
    length_function = len_func,
    is_separator_regex=False
)

In [None]:
print("\n\n".join(["\n".join([" ".join(['word']*100)]*20)]*10))

In [None]:
chunk_list = text_splitter.create_documents(texts = [extra_line + hp_book])

chunk_list

## Split by tokens

In [None]:
# !pip install tiktoken

In [None]:
sample_sent = "This is a sample sentence for you to tell me how the tokens are split in this sentence"

sample_sent.split(" ")

["Thi", "s", "is", "sample"]

In [None]:
# `model_name` alone selects the tokenizer; "text-embedding-3-small" is a model
# name, not a tiktoken encoding name, so `encoding_name` is not passed.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n\n",
    chunk_size=1200,
    chunk_overlap=100,
    model_name="text-embedding-3-small",
)

In [None]:
doc_list = text_splitter.create_documents([hp_book])

doc_list

In [None]:
[len(doc.page_content) for doc in doc_list]

In [None]:
doc_list = text_splitter.split_text(hp_book)

doc_list

In [None]:
from langchain.docstore.document import Document

# Wrap every text chunk in a Document tagged with the file it came from.
res_doc_list = [
    Document(page_content=chunk_text, metadata={"source": filepath})
    for chunk_text in doc_list
]

res_doc_list

In [None]:
# Sample source text for the code splitter. The literal is raw so that the
# "\n" sequences inside the embedded string literals stay as backslash-n and
# the sample remains valid Python instead of being broken across lines.
python_code = r"""def peer_review(article_id):
    chat = ChatOpenAI()
    loader = ArxivLoader(query=article_id, load_max_docs=2)
    data = loader.load()
    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']
    summary = first_record.metadata['Summary']
    
    summary_list = []
    for record in data:
        summary_list.append(record.metadata['Summary'])
    full_summary = "\n\n".join(summary_list)
    
    system_template = "You are a Peer Reviewer"
    human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
    prompt = chat_prompt.format_prompt(title=title, content=page_content)

    response = chat(messages = prompt.to_messages())

    return response.content"""

In [None]:
# code splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=10
)

In [None]:
text_splitter.create_documents(texts = [python_code])

## Embeddings

In [None]:
import numpy as np

In [None]:
# Let's start with OpenAI models

import os
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# Load environment variables (should already be loaded from previous cells)
# Ensure OPENAI_API_KEY is set in your .env file
if 'OPENAI_API_KEY' not in os.environ:
    from dotenv import load_dotenv
    load_dotenv()
    if 'OPENAI_API_KEY' not in os.environ:
        raise ValueError("Please set OPENAI_API_KEY in your .env file")

In [None]:
embedding_function = OpenAIEmbeddings()

In [None]:
text = "The scar had not pained Harry for nineteen years. All was well"

embedded_text = embedding_function.embed_query(text)

In [None]:
np.array(embedded_text).shape

In [None]:
from langchain.docstore.document import Document

doc_lines = [
    Document(page_content="It is our choices, Harry, that show what we truly are, far more than our abilities", metadata = {"source": "Harry Potter"}),
    Document(page_content=text, metadata = {"source": "Harry Potter"}),
]

doc_lines

In [None]:
# Extract the page_content

line_list = [doc.page_content for doc in doc_lines]

line_list

In [None]:
embedded_docs = [embedding_function.embed_query(line) for line in line_list]

np.array(embedded_docs).shape

In [None]:
embedded_docs = embedding_function.embed_documents(line_list)

np.array(embedded_docs).shape

In [None]:
# MTEB leaderboard

In [None]:
# %pip installs into the environment of the running kernel.
%pip install sentence_transformers

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [None]:
bge_embed_record = embedding_function.embed_query("This is some random text")
bge_embed_records = embedding_function.embed_documents(["This is some random text"])

print(np.array(bge_embed_record).shape)
print(np.array(bge_embed_records).shape)

In [None]:
from langchain_community.embeddings import FakeEmbeddings

embedding_function = FakeEmbeddings(size=300)

fake_embed_record = embedding_function.embed_query("This is some random text")
fake_embed_records = embedding_function.embed_documents(["This is some random text"])

In [None]:
np.array(fake_embed_record).shape

In [None]:
np.array(fake_embed_records).shape

# Vectorstores

In [None]:
# %pip installs into the environment of the running kernel.
%pip install "chromadb==0.4.24" "faiss-cpu==1.8.0"

In [None]:
!pip show chromadb

In [None]:
!pip show faiss-cpu

In [None]:
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings, FakeEmbeddings
from langchain_community.embeddings import FakeEmbeddings

In [None]:
loader = WikipediaLoader(query='Elon Musk', load_max_docs=5)
documents = loader.load()
documents

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
docs = text_splitter.split_documents(documents=documents)
print(len(docs))
docs

In [None]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

######### 

# embedding_function = FakeEmbeddings(size=300)

In [None]:
query = "Who is Elon Musk's Father?"

In [None]:
# Process
# Query -> Query Embeddings
# Chunks -> Chunk Embeddings -> Vectorstore
# Query Embeddings and Chunk Embeddings will be matched to get the results

In [None]:
# FAISS (in memory database)

from langchain.vectorstores import FAISS

In [None]:
db = FAISS.from_documents(docs, embedding_function)

# 'document in text' - embeddings
# Query -> query embeddings -> match with the embeddings in the vector store -> return the text connected to those embeddings

In [None]:
# Querying

matched_docs = db.similarity_search(query=query, k=5)

matched_docs

In [None]:
["errol musk" in doc.page_content.lower() for doc in matched_docs]

In [None]:
from langchain.vectorstores import Chroma

In [None]:
db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/elon_musk_db")

In [None]:
# Loading the existing database

loaded_db = Chroma(persist_directory="../output/elon_musk_db", embedding_function=embedding_function)

In [None]:
# Query
# Search the store that was re-opened from disk (`loaded_db`), so the cell
# actually exercises the persisted collection.

print(query)

matched_docs = loaded_db.similarity_search(query=query, k=3)

matched_docs

In [None]:
# Adding the family information

family_data_loader = WikipediaLoader(query="Musk Family", load_max_docs=1)
family_documents = family_data_loader.load()
family_documents

In [None]:
family_docs = text_splitter.split_documents(documents=family_documents)
print(len(family_docs))
family_docs

In [None]:
# Adding new information
# `add_documents` appends to the already-open collection instead of
# constructing a second Chroma object over the same persist directory.
family_doc_ids = db.add_documents(family_docs)

In [None]:
matched_docs = db.similarity_search(query=query, k=4)

matched_docs

In [None]:
# Deleting the information
# Updating the information

# Retrievers

In [None]:
retriever = db.as_retriever()

retriever

In [None]:
matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

In [None]:
# Configure how the retriever searches (search type) and how many items it returns (k)
# MMR - Maximum marginal relevance

retriever = db.as_retriever(search_type="mmr", search_kwargs = {"k": 5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

In [None]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs = {"score_threshold": 0.5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

In [None]:
# Use the public delete API rather than the private `_collection` handle.
# NOTE(review): no ids were passed when the store was built, so "1" is
# probably not an existing id — confirm which chunk should be removed.
db.delete(ids=["1"])

In [None]:
len(docs)

In [None]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs = {"score_threshold": 0.5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

In [None]:
docs

# Other Retrievers

In [None]:
import chromadb
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

chunk_size = 400
chunk_overlap = 100

# Loading the environment variables
load_dotenv()

# Loading the chat model
chat = ChatOpenAI()

# Loading data
loader = WikipediaLoader(query='Steve Jobs', load_max_docs=5)
documents = loader.load()

# Text splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
docs = text_splitter.split_documents(documents=documents)

# Embedding function
embedding_function = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={"normalize_embeddings": True}
)

# Vector store
db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/steve_jobs_db")

retriever = db.as_retriever()

query = "When was Steve Jobs fired from Apple?"

# BM25 Retriever

In [None]:
# !pip install rank_bm25

In [None]:
from langchain.retrievers import BM25Retriever

In [None]:
bm25_retriever = BM25Retriever.from_documents(docs)

In [None]:
# The BM25 index was built from the Steve Jobs chunks, so search it with the
# Steve Jobs question instead of the leftover "Musk" keyword.
matched_docs = bm25_retriever.get_relevant_documents(query)

In [None]:
matched_docs

# Semantic Retrievers

In [None]:
# !pip install python-dotenv

In [None]:
# chunk_size = 400
# chunk_overlap = 100

# # Loading the environment variables
# load_dotenv()

# # Loading the chat model
# chat = ChatOpenAI()

# # Loading data
# loader = WikipediaLoader(query='Steve Jobs', load_max_docs=5)
# documents = loader.load()

# # Text splitting
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# docs = text_splitter.split_documents(documents=documents)

# # Embedding function
# embedding_function = HuggingFaceBgeEmbeddings(
#     model_name="BAAI/bge-large-en-v1.5",
#     model_kwargs={'device': 'cpu'},
#     encode_kwargs={"normalize_embeddings": True}
# )

# # Vector store
# db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/steve_jobs_db")

In [None]:
retriever = db.as_retriever()

In [None]:
query = "When was Steve Jobs fired from Apple?"

## MultiQuery Retriever

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
mq_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=chat)

In [None]:
import logging
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

In [None]:
mq_retriever.get_relevant_documents(query=query)

In [None]:
print(*["1. What year did Apple terminate Steve Jobs' employment?", '2. At what point in time was Steve Jobs ousted from his position at Apple?',
      '3. When did Steve Jobs experience his departure from Apple through termination?'], sep='\n')

In [None]:
retrieved_docs = mq_retriever.get_relevant_documents(query=query)

In [None]:
['1985' in doc.page_content for doc in retrieved_docs]

In [None]:
print(retrieved_docs[1].page_content)

## Contextual Compression

In [None]:
# Retrieval

# Query -> get the responses

# Query + responses -> LLM

# Extract the relevant from the responses

In [None]:
# Re-open the store persisted earlier at "../output/steve_jobs_db"; the
# previous ".db" spelling pointed at a different directory.
db = Chroma(persist_directory="../output/steve_jobs_db", embedding_function=embedding_function)

In [None]:
sim_docs = retriever.get_relevant_documents(query=query)

In [None]:
sim_docs

In [None]:
chat = ChatOpenAI(temperature=0)

In [None]:
# Document compressor

from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm=chat)

compressor

In [None]:
print(compressor.llm_chain.prompt.template)

In [None]:
# Compression Retriever

from langchain.retrievers import ContextualCompressionRetriever

compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

compression_retriever

In [None]:
matched_docs = compression_retriever.get_relevant_documents(query=query)

In [None]:
matched_docs

In [None]:
[len(doc.page_content) for doc in matched_docs]

## Parent Document Retriever

In [None]:
# Split paragraph
# split sentence
# match sentences with query
# get the paragraph with most matching sentences.

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [None]:
parent_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=100)
child_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50)

store = InMemoryStore() # parent documents

In [None]:
par_doc_retriever = ParentDocumentRetriever(vectorstore=db, docstore=store, child_splitter=child_splitter, parent_splitter=parent_splitter)

In [None]:
par_doc_retriever.add_documents(docs)

In [None]:
par_doc_retriever.get_relevant_documents(query=query)

## Time-Weighted Vector Store Retriever

In [None]:
# matching_score = cosine_similarity + (1-decay_rate)^hours_passed

0.9 + (0.1**5)

In [None]:
0.9 + 0.1**2

In [None]:
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.retrievers import TimeWeightedVectorStoreRetriever

emb_size = 1024
index = faiss.IndexFlatL2(emb_size)
temp_db = FAISS(embedding_function, index, docstore=InMemoryDocstore({}), index_to_docstore_id={})

tw_retriever = TimeWeightedVectorStoreRetriever(vectorstore=temp_db, decay_rate=1/1000000, k=1)

In [None]:
tw_retriever

In [None]:
from datetime import datetime, timedelta
from langchain_core.documents import Document

five_hours_ago = datetime.now() - timedelta(hours=5)

tw_retriever.add_documents(
    [Document(page_content="What is John doing?", metadata={"last_accessed_at": five_hours_ago})]
)

tw_retriever.add_documents([Document(page_content="What is Jack doing?")])

In [None]:
tw_retriever.get_relevant_documents("What are you doing?")

## Hypothetical Document Retriever

In [None]:
# VS -> documents (answers)
# query -> query

# matching -> query & answers
# query + LLM -> hypothetical answer
# matching -> hypothetical with actual answers

In [None]:
# Question -> Embeddings
# documents -> Embeddings
# Match

# Question -> Answer with LLM -> Embeddings
# documents -> Embeddings
# Match

In [None]:
# Question -> Question embeddings

# docs -> document emb [1, 2, 3]


In [None]:
from langchain.prompts.chat import SystemMessagePromptTemplate, ChatPromptTemplate

def get_hypo_doc(query):
    """Generate a hypothetical answer document for ``query`` with the chat model.

    Uses the notebook-level ``chat`` model; the returned text is intended to be
    used as the search query in place of the raw question.

    Args:
        query: The question / topic to write about.

    Returns:
        The text content of the model's reply.
    """
    template = """Imagine you are an expert writing a detailed explanation on the topic: '{query}'
    Your response should be comprehensive and include key points that would be found in a top search result."""

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(template=template)
    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt])
    messages = chat_prompt.format_prompt(query=query).to_messages()

    # `invoke` replaces the deprecated direct call `chat(messages=...)`.
    response = chat.invoke(messages)

    return response.content

In [None]:
get_hypo_doc(query)

In [None]:
base_retriever = db.as_retriever(search_kwargs={"k": 1})

matched_docs = base_retriever.get_relevant_documents(query=get_hypo_doc(query))

matched_docs

In [None]:
from langchain.chains import HypotheticalDocumentEmbedder

In [None]:
hyde_embedding_function = HypotheticalDocumentEmbedder.from_llm(llm=chat, base_embeddings=embedding_function, prompt_key='web_search')

# web_search, sci_fact, arguana, trec_covid, fiqa, dbpedia, trec_news, mr_tydi

In [None]:
doc_db = Chroma.from_documents(docs, hyde_embedding_function, persist_directory="../output/steve_jobs_hyde")

In [None]:
matched_docs_new = doc_db.similarity_search(query)

matched_docs_new

In [None]:
matched_docs[0].page_content == matched_docs_new[0].page_content

## Ensemble Retriever

Retriever1 (0.6) -> docA, docC, docH
Retriever2 (0.4) -> docG, docY, docA

unweighted score of docA: 1/1 + 1/3 -> 4/3
weighted score of docA: (1/1)*0.6 + (1/3)*0.4 -> x

In [None]:
# Reciprocal Rank Fusion

from langchain.retrievers import EnsembleRetriever

In [None]:
bm25_retriever

In [None]:
par_doc_retriever

In [None]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, par_doc_retriever], weights=[0.5, 0.5])

In [None]:
hybrid_matched_docs = ensemble_retriever.get_relevant_documents(query=query)

In [None]:
hybrid_matched_docs

# Filters

## Embedding Redundant Filter

In [None]:
from langchain.document_transformers import EmbeddingsRedundantFilter

In [None]:
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_function)

In [None]:
redundant_filter.transform_documents(hybrid_matched_docs)

## Embeddings Filter

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings_filter = EmbeddingsFilter(embeddings = embedding_function)

embeddings_filter.compress_documents(docs, query=query)

# Reordering

## Long Context Reorder

The most relevant documents are moved to the beginning and the end of the list, leaving the least relevant ones in the middle.

In [None]:
from langchain_community.document_transformers import LongContextReorder

In [None]:
reorder = LongContextReorder()

reordered_docs = reorder.transform_documents(hybrid_matched_docs)

reordered_docs

## RAG Pipelines

In [None]:
# Chat Model

from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.cache import InMemoryCache
from langchain.globals import set_llm_cache

load_dotenv()

chat = ChatOpenAI()
set_llm_cache(InMemoryCache())

In [None]:
# Data Loader

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path="../datasets/udhr_booklet_en_web.pdf")

documents = loader.load()

documents

In [None]:
# Text Splitting

from langchain.text_splitter import RecursiveCharacterTextSplitter


chunk_size = 500
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
docs = text_splitter.split_documents(documents=documents)

len(docs)

In [None]:
# Embedding Function

from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [None]:
# Vector store
from langchain.vectorstores import Chroma

db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/human_rights")

In [None]:
query = "How does the declaration address the discrimination?"

## Exercise 1

In [None]:
import warnings

warnings.filterwarnings('ignore')

import logging
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

In [None]:
# Contextual Compression + Multi-query retriever

In [None]:
# Compressor

from langchain.retrievers.document_compressors import LLMChainExtractor

base_compressor = LLMChainExtractor.from_llm(llm=chat)

In [None]:
# Multi-Query Retriever

from langchain.retrievers.multi_query import MultiQueryRetriever

base_retriever = db.as_retriever()
mq_retriever = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=chat)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever

mq_compression_retriever = ContextualCompressionRetriever(base_compressor=base_compressor, base_retriever=mq_retriever)

In [None]:
matched_docs = mq_compression_retriever.get_relevant_documents(query=query)
matched_docs

In [None]:
# Concatenate the retrieved passages, each followed by a blank line.
matched_content = "".join(doc.page_content + "\n\n" for doc in matched_docs)

print(matched_content)

In [None]:
# Augmentation

from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate

template = """Answer the following the question only by using the content given below in the triple backticks, do not use any other information to answer the question. If you can't answer the given question with the given context, you can return 'NO_OUTPUT' as a string.

Context: ```{context}```
-------------------------------
Question: {query}
-------------------------------
Answer: """

human_message_prompt = HumanMessagePromptTemplate.from_template(template=template)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
prompt = chat_prompt.format_prompt(query=query, context=matched_content)
messages = prompt.to_messages()
messages

In [None]:
# Generation
# `invoke` replaces the deprecated direct call `chat(messages=...)`.
response = chat.invoke(messages).content

print(response)

## Exercise 2

In [None]:
# Compressor -> HyDE + redundant filter + reordering
# Retriever -> Ensemble Retriever (Multi-query retriever, Tfidf, Parent Document)

In [None]:
# Compressor

from langchain.chains import HypotheticalDocumentEmbedder
from langchain.document_transformers import EmbeddingsRedundantFilter, LongContextReorder
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

hyde_embedding_function = HypotheticalDocumentEmbedder.from_llm(llm=chat, base_embeddings=embedding_function, prompt_key='web_search')
redundant_filter = EmbeddingsRedundantFilter(embeddings=hyde_embedding_function)
lcr = LongContextReorder()

compression_pipeline = DocumentCompressorPipeline(transformers = [redundant_filter, lcr])

compression_pipeline

In [None]:
# Retrievers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.retrievers import TFIDFRetriever, MultiQueryRetriever, ParentDocumentRetriever, EnsembleRetriever, ContextualCompressionRetriever

## TFIDF
tfidf_retriever = TFIDFRetriever.from_documents(docs)

## Multi-Query
mq_retriever = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=chat)

## Parent document Retriever
parent_splitter= RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
child_splitter= RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
store = InMemoryStore()

### Creating an instance of parent-document retriever
par_doc_retriever = ParentDocumentRetriever(
    vectorstore=db,
    docstore=store, 
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

par_doc_retriever.add_documents(docs)

In [None]:
# Ensemble Retriever
from langchain.retrievers import EnsembleRetriever

# Combine the three retrievers; the weights are listed in the same order.
retriever_pipeline = EnsembleRetriever(retrievers = [tfidf_retriever, mq_retriever, par_doc_retriever], weights=[0.4, 0.3, 0.3])

# Wrap the ensemble so its results pass through `compression_pipeline`.
compression_retriever = ContextualCompressionRetriever(base_compressor=compression_pipeline, base_retriever=retriever_pipeline)

matching_docs = compression_retriever.get_relevant_documents(query=query)
matching_docs

In [None]:
# Retrieval QA Chain

from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type='stuff',
    retriever=compression_retriever,
    return_source_documents=True
)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt)

In [None]:
# `invoke` replaces the deprecated direct call `qa_chain(query)`; the
# RetrievalQA chain takes the question under the "query" key.
response = qa_chain.invoke({"query": query})
response

In [None]:
print(response['result'])

In [None]:
# TASK

# Private GPT