The standard RAG workflow

<img src="./images/standard_rag_workflow.png" alt="RAG Workflow" width="600">

-----

Preparing data for retrieval

In [None]:
# Document loaders
# The concrete loader classes are provided by `langchain_community.document_loaders`;
# `langchain_core.document_loaders.base` only contains the abstract base classes,
# so importing the loaders from there raises ImportError.
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    UnstructuredHTMLLoader,
)

# Loading CSV files
csv_loader = CSVLoader(file_path="path/to/your/file.csv")
documents = csv_loader.load()
print(documents)


# Loading PDF files
# NOTE(review): PyPDFLoader presumably needs the `pypdf` package installed — confirm.
pdf_loader = PyPDFLoader(file_path="path/to/your/file.pdf")
documents = pdf_loader.load()
print(documents)


# Loading HTML files
# NOTE(review): the Unstructured* loaders presumably need the `unstructured` package — confirm.
html_loader = UnstructuredHTMLLoader(file_path="path/to/your/file.html")
documents = html_loader.load()
first_document = documents[0]

# Each loaded Document exposes its text and its metadata separately.
print("Content: ", first_document.page_content)
print("Metadata: ", first_document.metadata)

Preparing data for retrieval

In [None]:
# Trade-offs when choosing splitter parameters
#   chunk_size    - large chunks are slower to process and harder to interpret,
#                   small chunks are quick but risk losing surrounding context
#   chunk_overlap - repeating a little text between neighbouring chunks helps
#                   carry context across the chunk boundary

from langchain_text_splitters import CharacterTextSplitter

# Sample passage; the embedded "\n\n" sequences mark the paragraph breaks
# that the splitter below uses as its separator.
text = """Machine learning is a fascinating field.\n\nIt involves algorithms and models that 
can learn from data. These models can then make predictions or decisions without being 
explicitly programmed to perform the task. \nThis capability is increasingly valuable in 
various industries, from finance to healthcare. \n\nThere are many types of machine Learning, 
including supervised, unsupervised, and reinforcement learning. \nEach type has its own strengths and applications. """

# Split on paragraph breaks only, using the size/overlap settings below.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=10, separator="\n\n")

chunks = text_splitter.split_text(text)
print(chunks)
# Show the character length of every chunk that was produced.
print(list(map(len, chunks)))

In [None]:
# Recursive character splitting
# The separators are listed from coarsest ("\n\n") to finest ("").
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=10,
    chunk_size=100,
)

chunks = splitter.split_text(text)
print(chunks)
# Character length of each resulting chunk.
print(list(map(len, chunks)))

Embedding and storing the chunks

In [None]:
! pip install sentence-transformers chromadb

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Sentence-transformers model used to embed each chunk.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# `chunks` was produced by `split_text`, so it is a list of plain strings.
# `Chroma.from_documents` expects Document objects (it reads `.page_content`),
# which would fail on strings; `from_texts` is the matching constructor.
vector_store = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
)

LangChain Expression Language (LCEL) for RAG
  - It is a declarative syntax for describing chains, from prototypes to production
  - It creates modular retrieval pipelines that combine retrieval and generation components

<img src="./images/lcel.png" alt="LCEL retrieval chain" width="600">

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# `chunks` is a list of strings (output of `split_text`), so the store must be
# built with `from_texts`; `from_documents` requires Document objects.
vector_store = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
)

# Instantiate a retriever that returns the 2 most similar chunks
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)


# Creating a prompt template
prompt = ChatPromptTemplate.from_template("""
  Use the following pieces of context to answer the question at the end.
  If you don't know the answer, say that you don't know.
  Context: {context}
  Question: {question}
""")


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: iterable of objects exposing a `page_content` attribute
            (the Documents returned by the retriever).

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Building an LCEL retrieval chain
# NOTE(review): `llm` is not defined in any cell visible here — a chat model
# must be instantiated before this cell runs, otherwise this raises NameError.
chain = (
    {
        # The chain input (the question string) is sent to the retriever and the
        # retrieved documents are flattened to plain text for the prompt.
        "context": retriever | format_docs,
        # RunnablePassthrough forwards the question unchanged to the prompt.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Invoke with the question string itself: the first step feeds the chain input
# straight to the retriever, which expects a query string rather than a dict.
result = chain.invoke("What are the key findings or results presented in the paper?")
print(result)

-------

Extending the data retrieval with other files

In [None]:
# Loading markdown files
# The loader class is provided by `langchain_community.document_loaders`,
# not by `langchain_core.document_loaders.base` (abstract base classes only).
from langchain_community.document_loaders import UnstructuredMarkdownLoader

markdown_loader = UnstructuredMarkdownLoader(file_path="path/to/your/file.md")
markdown_content = markdown_loader.load()
# Show the first loaded Document.
print(markdown_content[0])

In [None]:
# Loading python files
# The loader class is provided by `langchain_community.document_loaders`,
# not by `langchain_core.document_loaders.base` (abstract base classes only).
from langchain_community.document_loaders import PythonLoader

python_loader = PythonLoader(file_path="path/to/your/file.py")
python_content = python_loader.load()
# Show the first loaded Document.
print(python_content[0])

In [None]:
# Splitting the code files
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# `from_language` selects separators suited to the given programming language.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=150,
    chunk_overlap=10,  # was misspelled `chink_overlap`, an unknown keyword argument
)

# `python_content` is the list of Documents produced by the PythonLoader cell
# (the previous name `python_data` was never defined).
chunks = python_splitter.split_documents(python_content)

# Preview the first three chunks.
for i, chunk in enumerate(chunks[:3], start=1):
    print(f"Chunk {i}:\n{chunk.page_content}\n")

------

Advanced Splitting Methods

<img src="./images/limitation of splitting.png" alt="Limitation of character-based splitting" width="600">

In [None]:
# Splitting by tokens instead of characters
import tiktoken
from langchain_text_splitters import TokenTextSplitter

example_string = "Mary had a little lamb, it's fleece was white as snow."

# Look up the tokenizer used by the target model so that chunk sizes are
# measured in that model's tokens.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
splitter = TokenTextSplitter(
    encoding_name=encoding.name,
    chunk_size=10,
    chunk_overlap=2,
)

chunks = splitter.split_text(example_string)

# Print each chunk together with its token count.
# The format string previously contained a literal "nNo." where "\nNo." was intended.
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i}:\nNo. tokens: {len(encoding.encode(chunk))}\n{chunk}\n")

Semantic Splitting

In [None]:
import os

from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

# Read the key from the environment instead of hard-coding it in the notebook,
# so the secret never ends up in the saved file or its outputs.
embeddings = OpenAIEmbeddings(
    api_key=os.environ["OPENAI_API_KEY"],
    model='text-embedding-3-small',
)

# NOTE(review): with the "gradient" threshold type the amount is presumably
# interpreted as a percentile — confirm that 0.8 is the intended value.
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=0.8,
)

# `document` was not defined in any visible cell; `documents` is the list of
# Documents returned by the loaders earlier in the notebook.
# NOTE(review): confirm this is the intended input.
chunks = semantic_splitter.split_documents(documents)
print(chunks[0])