# Final Project: AI RAG Assistant Using LangChain

In [24]:
# Install required packages
!pip install chromadb
!pip install langchain_community
!pip install pypdfium2
!pip install sentence-transformers
!pip install wget

# Suppress warnings and clear all output from PIP
import warnings
from IPython.display import clear_output
def warn(*args, **kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings('ignore')
clear_output()

## Task 1: Load document using LangChain

In [11]:
from pprint import pprint

from langchain_community.document_loaders import PyPDFium2Loader

# Location of the PDF to load (adjacent literals are joined into one URL).
pdf_url = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "WgM1DaUn2SYPcCg_It57tA/"
    "A-Comprehensive-Review-of-Low-Rank-Adaptation-in-Large-Language-Models"
    "-for-Efficient-Parameter-Tuning-1.pdf"
)

# Hand the URL to the loader and read the PDF into a list of documents.
loader = PyPDFium2Loader(pdf_url)
data = loader.load()

# Preview only the first 1000 characters of the first document's text.
pprint(data[0].page_content[:1000])

('A Comprehensive Review of Low-Rank\n'
 'Adaptation in Large Language Models for\n'
 'Efficient Parameter Tuning\n'
 'September 10, 2024\n'
 'Abstract\n'
 'Natural Language Processing (NLP) often involves pre-training large\n'
 'models on extensive datasets and then adapting them for specific tasks\n'
 'through fine-tuning. However, as these models grow larger, like GPT-3\n'
 'with 175 billion parameters, fully fine-tuning them becomes '
 'computa\x02tionally expensive. We propose a novel method called LoRA '
 '(Low-Rank\n'
 'Adaptation) that significantly reduces the overhead by freezing the '
 'orig\x02inal model weights and only training small rank decomposition '
 'matrices.\n'
 'This leads to up to 10,000 times fewer trainable parameters and reduces\n'
 'GPU memory usage by three times. LoRA not only maintains but some\x02times '
 'surpasses fine-tuning performance on models like RoBERTa, De\x02BERTa, '
 'GPT-2, and GPT-3. Unlike other methods, LoRA introduces\n'
 'no extra laten

## Task 2: Apply text splitting techniques

In [12]:
# Sample LaTeX document used as the input text for the LaTeX-aware splitter in
# the next cell. It is a raw string (r"""...""") so the backslashes in commands
# such as \section and \subsection are kept literally instead of being treated
# as escape sequences.
latex_text = r"""

    \documentclass{article}

    \begin{document}

    \maketitle

    \section{Introduction}

    Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in various natural language processing tasks, including language translation, text generation, and sentiment analysis.

    \subsection{History of LLMs}

    The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
    
    \subsection{Applications of LLMs}
    
    LLMs have many applications in the industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
    
    \end{document}
    
"""

In [14]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Build a recursive splitter configured for LaTeX input, using a chunk size
# of 60 and no overlap between neighbouring chunks.
latex_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.LATEX, chunk_size=60, chunk_overlap=0
)

# Split the sample LaTeX text into Document objects and display them.
latex_docs = latex_splitter.create_documents(texts=[latex_text])
latex_docs

[Document(metadata={}, page_content='\\documentclass{article}\n\n    \\begin{document}'),
 Document(metadata={}, page_content='\\maketitle\n\n    \\section{Introduction}\n\n    Large language'),
 Document(metadata={}, page_content='models (LLMs) are a type of machine learning model that can'),
 Document(metadata={}, page_content='be trained on vast amounts of text data to generate'),
 Document(metadata={}, page_content='human-like language. In recent years, LLMs have made'),
 Document(metadata={}, page_content='significant advances in various natural language processing'),
 Document(metadata={}, page_content='tasks, including language translation, text generation, and'),
 Document(metadata={}, page_content='sentiment analysis.\n\n    \\subsection{History of LLMs}'),
 Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),
 Document(metadata={}, page_content='but they were limited by the amount of data that could be'),
 Document(metadata={}, page_

## Task 3: Embed documents

In [18]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model, identified by its sentence-transformers model name.
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)

# Embed a sample query and show the first five components of the result.
query = "How are you?"
query_result = embedding.embed_query(text=query)
query_result[:5]

[0.02710619755089283,
 0.011331832036376,
 -0.0019523875089362264,
 -0.036951325833797455,
 0.01776488684117794]

## Task 4: Create and configure vector databases to store embeddings

In [27]:
from pathlib import Path
from urllib.request import urlretrieve

from langchain_community.document_loaders import TextLoader
# Import Chroma from langchain_community, like the other integrations in this
# notebook, instead of the deprecated `langchain.vectorstores` path.
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the text from file.
# The file is downloaded only when it is not already present, so the cell runs
# on a fresh environment without manually uncommenting any code.
file_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt"
file_name = "new-Policies.txt"
if not Path(file_name).exists():
    urlretrieve(file_url, file_name)
loader = TextLoader(file_name)
data = loader.load()

# Split text into chunks:
# chunk_size / chunk_overlap are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
chunks = text_splitter.split_documents(data)

# Create the vector database:
# note: use the embedding from Task 3
# Each chunk gets an explicit id equal to its position in `chunks` ("0", "1", ...).
ids = [str(i) for i in range(len(chunks))]
vector_db = Chroma.from_documents(chunks, embedding, ids=ids)

# Conduct similarity search:
query = "Smoking policy"
docs = vector_db.similarity_search(query, k=5)  # show top 5 results
docs

[Document(metadata={'source': 'new-Policies.txt'}, page_content='Our Mobile Phone Policy defines standards for responsible use of mobile devices within the'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='4. Mobile Phone Policy'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='Consequences: Non-compliance with this policy may result in disciplinary actions, including'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy promotes the safe and responsible use of digital communication tools in line with our')]

## Task 5: Develop a retriever to fetch document segments based on queries

In [28]:
# `vector_db` is the Chroma database built in Task 4 from new-Policies.txt,
# so the file does not need to be loaded again here.
query = "Email policy"

# Wrap the vector store as a retriever that is configured with k=2.
retriever = vector_db.as_retriever(search_kwargs=dict(k=2))
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'new-Policies.txt'}, page_content='3. Internet and Email Policy'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='Our Internet and Email Policy ensures the responsible and secure use of these tools within our')]