In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


### 1. Load the `document.pdf` file

In [4]:
# Load the PDF into a list of LangChain documents and print a short summary.

# Path of the PDF to ingest (relative to the notebook's working directory):
PDF_FILE_PATH = "document.pdf"

# Build a loader for that path and read the document content:
loader = PyPDFLoader(PDF_FILE_PATH)
documents = loader.load()

# Fail early with a clear message: the summary below indexes documents[0],
# which would otherwise surface as a bare IndexError on an empty result.
if not documents:
    raise ValueError(f"No pages/documents could be loaded from {PDF_FILE_PATH!r}.")

# The first loaded entry is used as the sample for the summary.
first_page = documents[0]
separator = "-" * 50

# Print a summary of what was loaded:
print("✅ Document loaded successfully!")
print(f"Total pages/documents loaded: {len(documents)}")
print(separator)
print("Snippet of Page 01 Content (First 200 chars):")
print(first_page.page_content[:200] + "...")
print(separator)
print(f"Metadata of Page 01: {first_page.metadata}")

✅ Document loaded successfully!
Total pages/documents loaded: 15
--------------------------------------------------
Snippet of Page 01 Content (First 200 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
--------------------------------------------------
Metadata of Page 01: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'document.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


### 2. Split the document into chunks

In [5]:
# Split the loaded documents into smaller, overlapping chunks and print a
# short verification summary.

# 1. Configure the text splitter:
CHUNK_SIZE = 1000     # Upper bound on the no. of characters per chunk (chunks may be shorter)
CHUNK_OVERLAP = 200   # No. of overlapping characters between adjacent chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,          # chunk length is measured with len(), i.e. in characters
    is_separator_regex=False,
)

# 2. Split the documents (list of large documents -> list of smaller ones):
chunked_documents = text_splitter.split_documents(documents)

# Fail early with a clear message: the verification below indexes
# chunked_documents[0], which would otherwise raise a bare IndexError.
# NOTE(review): an empty result presumably means the loaded pages had no
# extractable text — confirm against the input PDF.
if not chunked_documents:
    raise ValueError(
        "Splitting produced no chunks; the loaded documents may contain no text."
    )

# The first chunk is used as the sample for the verification output.
first_chunk = chunked_documents[0]

# 3. Verification:
print("✅ Document successfully split into chunks!")
print(f"Original number of pages/documents: {len(documents)}")
print(f"Total number of chunks created: {len(chunked_documents)}")
# Show the chunk's metadata (e.g., page number):
print(f"Example Chunk Metadata: {first_chunk.metadata}")
print(f"Example Chunk Content (Length: {len(first_chunk.page_content)}):")
print(first_chunk.page_content[:300] + "...")

✅ Document successfully split into chunks!
Original number of pages/documents: 15
Total number of chunks created: 52
Example Chunk Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'document.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}
Example Chunk Content (Length: 986):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Par...
