# January 31, 2024 - Session Notes

##### 1. Loading - Transforming - Embedding - Storing

In [None]:
from langchain.document_loaders import TextLoader

In [None]:
# Loader for the sample essay text file.
loader = TextLoader(
    file_path="./Data_Samples/paul_graham_essay.txt",
)

In [None]:
# Run the loader; `data` is what the splitters below consume via split_documents().
data = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter

In [None]:
# Character-based splitter that breaks the text on newlines.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=125,
)

In [None]:
# Split the loaded documents with the newline-based character splitter.
chunks_2 = text_splitter.split_documents(data)

In [None]:
# Token-based chunking via tiktoken. chunk_overlap is set explicitly because
# the splitter's default overlap (200) would equal chunk_size here, which
# makes consecutive chunks almost identical.
token_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=200,
    chunk_overlap=25,
)

In [None]:
# Split the same documents again, this time with the tiktoken-based splitter.
chunks_by_tokens = token_splitter.split_documents(data)

In [None]:
# ChatOpenAI is not imported anywhere else in this notebook, so import it
# here; without it this cell raises NameError.
from langchain_community.chat_models import ChatOpenAI

# temperature=0 is passed explicitly to the chat model.
llm = ChatOpenAI(temperature=0)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Embedding function built with HuggingFaceEmbeddings' defaults (no model name is passed).
hf_embedding_function = HuggingFaceEmbeddings()

In [None]:
from langchain_community.vectorstores import Chroma
import chromadb

In [None]:
# Load the existing vector DB persisted under ./state_db.
# The embedding function has to be `hf_embedding_function`; the name
# `embedding_function` is never defined in this notebook (NameError).
# NOTE(review): confirm ./state_db was built with the same embeddings.
freshly_loaded_donut = Chroma(
    embedding_function=hf_embedding_function,
    persist_directory="./state_db",
)


In [None]:
# Sanity check: query the loaded store directly, asking for one match (k=1).
result = freshly_loaded_donut.similarity_search(
    "Who ran American office for economic vision?",
    k=1,
)

In [None]:
# Number of documents returned by the similarity search.
print(len(result))

In [None]:
# Raw view of the returned documents.
print(result)

##### 2. Retriever

In [None]:
# Expose the vector store as a retriever, using as_retriever() defaults (no arguments passed).
retriever = freshly_loaded_donut.as_retriever()

In [None]:
# Retrievers are Runnables, so call invoke() (as the chat-model call at the
# end of this notebook does) instead of the older get_relevant_documents().
original_result = retriever.invoke("Who ran American office for economic vision?")

In [None]:
# NOTE: pretty_print_docs is defined in a later cell (under section 2(a));
# run that cell first, otherwise this call raises NameError.
pretty_print_docs (original_result)

##### 2(a) Multi Query Retriever

A Multi Query Retriever is a technique used in Retrieval-Augmented Generation (RAG) pipelines in which a Large Language Model (LLM) is used to automate the process of query tuning. The LLM generates multiple queries from different perspectives for a given user question; each query is run against the vector store, and the unique union of the retrieved documents is returned. The idea is to gather more information and different viewpoints from the documents in the vector store, which can help in providing a more comprehensive and accurate answer. However, this technique also has drawbacks: it takes longer to execute and costs more, due to the additional LLM invocations and increased token usage.

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Build the multi-query retriever on top of the plain vector-store retriever,
# passing `llm` as the model it works with.
multi_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=retriever,
)

In [None]:
# Use the Runnable invoke() entry point (as the chat-model call at the end of
# this notebook does) instead of the older get_relevant_documents().
result = multi_retriever.invoke("Who ran American office for economic vision?")

In [None]:
# Raw view of the documents returned by the multi-query retriever.
print(result)

In [None]:
# Helper for printing retrieved documents in an organized manner.

def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Headings are numbered from 1 ("Document 1:", "Document 2:", ...) and
    consecutive entries are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    entries = (
        f"Document {number}:\n\n" + doc.page_content
        for number, doc in enumerate(docs, start=1)
    )
    print(divider.join(entries))

In [None]:
# Show the documents returned by the multi-query retriever in readable form.
pretty_print_docs(result)

##### 2(b) ContextualCompressionRetriever

`ContextualCompressionRetriever` is a LangChain retriever that wraps a base retriever together with a document compressor. It is designed to work with large language models (LLMs): a query is first sent to the base retriever, and the retrieved documents are then passed through the compressor, which shortens each document to the parts relevant to the query or drops irrelevant documents altogether. "Compression" here therefore means reducing the retrieved context with respect to the query, not storing documents in a compressed format. This is useful when the retrieved chunks contain a lot of text unrelated to the question, since passing all of it to the language model makes calls more expensive and can degrade the response. In the cells below the compressor is an `LLMChainExtractor`, which uses an LLM to extract the relevant passages from each retrieved document. The `ContextualCompressionRetriever` can be used in conjunction with other LangChain modules, such as the language model integration and prompt handling modules, to create advanced language model applications.

In [None]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever

In [None]:
# Document compressor built from the current `llm`.
compressor = LLMChainExtractor.from_llm(llm=llm)

In [None]:
# Pair the plain retriever with the compressor defined for this section.
contextual_compressor = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [None]:
# Use the Runnable invoke() entry point (as the chat-model call at the end of
# this notebook does) instead of the older get_relevant_documents().
new_result = contextual_compressor.invoke("Who ran American office for economic vision?")

In [None]:
# Show the documents returned by the contextual compression retriever.
pretty_print_docs(new_result)

In [None]:
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI


In [None]:
# NOTE: this rebinds `llm`, which earlier cells assigned from ChatOpenAI.
llm = ChatGoogleGenerativeAI(model="gemini-pro")

In [None]:
from langchain_core.messages import HumanMessage

In [None]:
# Single human message to send to the chat model.
# NOTE(review): "minster" is presumably a typo for "minister"; left unchanged
# because it is part of the prompt text.
message = HumanMessage(content="who is the prime minster of UK")

In [None]:
# Send the message (wrapped in a list) to the model. `result` is rebound here
# (earlier cells used it for retriever output); this cell displays nothing.
result = llm.invoke([message])

<<< End Of Document >>>