In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
import faiss
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline
from langchain_classic.chains import RetrievalQA

### 1. Load the `document.pdf` file

In [4]:
# Location of the PDF that feeds the rest of the notebook:
PDF_FILE_PATH = "document.pdf"

# Build the loader for that path, then read the file into a list of documents.
# (The recorded run produced 15 entries for a PDF whose metadata reports 15 pages.)
loader = PyPDFLoader(PDF_FILE_PATH)
documents = loader.load()

# Report what was loaded: the count, a short preview of the first entry,
# and that entry's metadata.
print("‚úÖ Document loaded successfully!")
print(f"Total pages/documents loaded: {len(documents)}")

divider = "-" * 50
print(divider)
print("Snippet of Page 01 Content (First 200 chars):")
first_page = documents[0]
print(f"{first_page.page_content[:200]}...")
print(divider)
print(f"Metadata of Page 01: {first_page.metadata}")

‚úÖ Document loaded successfully!
Total pages/documents loaded: 15
--------------------------------------------------
Snippet of Page 01 Content (First 200 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
--------------------------------------------------
Metadata of Page 01: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'document.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


### 2. Split the document into chunks

In [5]:
# Chunking parameters, measured in characters because the length function is `len`:
CHUNK_SIZE = 1000     # target size of each chunk
CHUNK_OVERLAP = 200   # characters shared between neighbouring chunks

# Gather the splitter options in one mapping so they can be inspected or tuned together.
splitter_settings = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Turn the list of loaded documents into a (longer) list of smaller chunk documents.
chunked_documents = text_splitter.split_documents(documents)

# Sanity report: counts before/after, plus the first chunk's metadata and a preview.
print("‚úÖ Document successfully split into chunks!")
print(f"Original number of pages/documents: {len(documents)}")
print(f"Total number of chunks created: {len(chunked_documents)}")

first_chunk = chunked_documents[0]
# The chunk carries the same metadata keys as its source page in the recorded run.
print(f"Example Chunk Metadata: {first_chunk.metadata}")
print(f"Example Chunk Content (Length: {len(first_chunk.page_content)}):")
print(f"{first_chunk.page_content[:300]}...")

‚úÖ Document successfully split into chunks!
Original number of pages/documents: 15
Total number of chunks created: 52
Example Chunk Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'document.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}
Example Chunk Content (Length: 986):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Par...


### 3. Create embeddings in the Vector Store

This is where the document's raw text is transformed into a searchable knowledge base.

In [6]:
# Sentence-transformer checkpoint used to embed every chunk:
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print(f"‚úÖ Embedding model '{EMBEDDING_MODEL_NAME}' loaded successfully.")

# Build the FAISS store straight from the chunk documents; the embedding model
# is handed over so the store can vectorise each chunk while keeping its
# text and metadata alongside the vector.
vector_store = FAISS.from_documents(chunked_documents, embeddings)

chunk_count = len(chunked_documents)
print(f"‚úÖ FAISS Vector Store created with {chunk_count} documents.")
print("-" * 50)

# Persist the index to a local folder so it can be reloaded later without
# re-embedding. Note that this cell itself always rebuilds the index.
FAISS_INDEX_PATH = "faiss_index_rag"
vector_store.save_local(FAISS_INDEX_PATH)
print(f"Index saved locally to folder: {FAISS_INDEX_PATH}")

‚úÖ Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.
‚úÖ FAISS Vector Store created with 52 documents.
--------------------------------------------------
Index saved locally to folder: faiss_index_rag


### 4. Add LLM of choice

Choose a model optimized for question answering and text generation, often built 
on a **Sequence-to-Sequence (Seq2Seq)** architecture like **T5** or **BART**.

Use the **Flan-T5** model—a powerful, relatively small, and effective LLM for 
this type of task.

This step requires the `AutoTokenizer` and `AutoModelForSeq2SeqLM` imports, along with the 
LangChain utility `RetrievalQA`, which combines the retriever and the LLM.

In [8]:
# Checkpoint of the generative model that writes the final answer:
LLM_MODEL_NAME = "google/flan-t5-base"

# Tokenizer and Seq2Seq weights are both pulled from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)

# Generation options kept together so they are easy to find and tune.
# NOTE(review): sampling is enabled and no seed is set in this cell, so
# answers may differ between runs — confirm whether that is acceptable.
generation_settings = {
    "max_length": 512,     # cap on the length of the generated answer
    "temperature": 0.1,    # low temperature to keep answers factual
    "do_sample": True,
}

# Hugging Face pipeline for text-to-text generation:
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# LangChain wrapper around the pipeline so it can be plugged into chains.
llm = HuggingFacePipeline(pipeline=pipe)

print(f"‚úÖ Generative LLM '{LLM_MODEL_NAME}' loaded and configured.")

# Retriever over the FAISS vector store, configured to return the 3 best-matching
# chunks for each query.
# NOTE(review): the recorded run of this notebook printed a tokenizer warning
# (692 > 512 tokens), which suggests the assembled prompt exceeds the model's
# stated input length — consider a smaller k or chunk size; confirm.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# Options for the RetrievalQA chain:
#   - "stuff" places all retrieved chunks into a single prompt
#   - the source documents are returned with the answer for later verification
chain_options = {
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
}

# The chain takes the user query, asks the retriever for relevant chunks,
# formats chunks + query into one prompt, and passes that prompt to the LLM.
qa_chain = RetrievalQA.from_chain_type(llm, **chain_options)

print("‚úÖ RAG Chain (RetrievalQA) successfully constructed.")
print("-" * 50)

# Smoke-test the chain with a question specific to document.pdf.
RAG_QUERY = '''What is the primary motivation for using the attention mechanism 
instead of recurrence and convolution in the Transformer model?'''

print(f"‚ùì Querying the RAG system: {RAG_QUERY}")
result = qa_chain.invoke({"query": RAG_QUERY})

print("\nü§ñ RAG Answer:")
print(result['result'])
print("\nüìú Source Document Metadata (to verify retrieval):")
for document in result['source_documents']:
    # The stored 'page' value is 0-based (page 0 carries page_label '1' in the
    # recorded metadata), so add 1 for display. Only do the arithmetic when an
    # integer page is actually present: the previous `.get('page', 'N/A') + 1`
    # raised TypeError (str + int) whenever the key was missing.
    page_index = document.metadata.get('page')
    page_display = page_index + 1 if isinstance(page_index, int) else 'N/A'
    print(f"- Page: {page_display}, Source: {document.metadata.get('source', 'document.pdf')}")

Device set to use mps:0
Token indices sequence length is longer than the specified maximum sequence length for this model (692 > 512). Running this sequence through the model will result in indexing errors


‚úÖ Generative LLM 'google/flan-t5-base' loaded and configured.
‚úÖ RAG Chain (RetrievalQA) successfully constructed.
--------------------------------------------------
‚ùì Querying the RAG system: What is the primary motivation for using the attention mechanism 
instead of recurrence and convolution in the Transformer model?

ü§ñ RAG Answer:
the amount of computation that can be parallelized

üìú Source Document Metadata (to verify retrieval):
- Page: 5, Source: document.pdf
- Page: 2, Source: document.pdf
- Page: 6, Source: document.pdf
