In [1]:
# ✅ Step 1: Install required dependencies
!pip install -q langchain PyMuPDF tiktoken transformers langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# ✅ Step 2: Import required libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os, time, psutil, re
import numpy as np
import tiktoken

In [3]:
# ✅ Step 3: Choose the PDF that PyMuPDFLoader will read in the next cell
# (Replace with your actual file path)
# NOTE(review): the default is an absolute path under /content — presumably the
# Colab upload directory; adjust it when running anywhere else.
pdf_path = "/content/testPDF.pdf"  # 🔁 Upload your PDF to Colab, or point this at a local file

In [4]:
# Snapshot this process's resident memory before loading so the loader's
# footprint can be reported later as a delta against final_mem.
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024  # bytes -> MB (1024 * 1024 bytes)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
# The clock starts after the memory snapshot so only the load itself is timed.
start_time = time.perf_counter()

loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

end_time = time.perf_counter()

In [5]:
# Resident set size after loading, converted from bytes to MB (1024 * 1024 bytes).
final_mem = process.memory_info().rss / (1024 * 1024)

# ✅ Step 4: Loader Performance Metrics
# Concatenate the text of every loaded document, separated by a single newline.
text = "\n".join(doc.page_content for doc in docs)

In [6]:
def count_tokens(text):
    """Return the number of tokens in ``text`` under tiktoken's ``cl100k_base`` encoding.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the token sequence produced by the encoder.
    """
    enc = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". Extracted document text is
    # arbitrary, so encode such literals as ordinary text and count them.
    return len(enc.encode(text, disallowed_special=()))

def content_to_noise(text):
    r"""Return the fraction of characters in ``text`` that match the regex ``\w``.

    ``\w`` covers letters, digits and the underscore, so the ratio treats
    underscores as content alongside alphanumeric characters.

    Args:
        text: The string to analyse.

    Returns:
        The ratio of matching characters to total characters, rounded to four
        decimal places, or ``0`` when ``text`` is empty.
    """
    word_char_count = sum(1 for _ in re.finditer(r'\w', text))
    length = len(text)
    # Guard clause: an empty string has no meaningful ratio (and would divide by zero).
    if length <= 0:
        return 0
    return round(word_char_count / length, 4)

In [7]:
# Derived quantities for the loader report, computed once up front.
word_char_total = len(re.findall(r'\w', text))
load_seconds = end_time - start_time
rss_growth_mb = final_mem - initial_mem

# Insertion order of the dict is the order the report is printed in.
loader_metrics = {
    "Total Character Count": len(text),
    "Alphanumeric Character Count": word_char_total,
    "Newline Character Count": text.count("\n"),
    "Token Count (GPT-4 Encoding)": count_tokens(text),
    "Content-to-Noise Ratio": content_to_noise(text),
    "Processing Time (sec)": round(load_seconds, 2),
    "Memory Usage (MB)": round(rss_growth_mb, 2),
    # Fixed qualitative label, not a measured value.
    "Structural Preservation": "❌ (No semantic headers preserved)",
}

# One "name: value" line per metric, under a header line.
print("🔍 Loader Performance Metrics (PyMuPDFLoader):")
print("\n".join(f"{name}: {value}" for name, value in loader_metrics.items()))

🔍 Loader Performance Metrics (PyMuPDFLoader):
Total Character Count: 1264
Alphanumeric Character Count: 1031
Newline Character Count: 24
Token Count (GPT-4 Encoding): 264
Content-to-Noise Ratio: 0.8157
Processing Time (sec): 0.11
Memory Usage (MB): 24.56
Structural Preservation: ❌ (No semantic headers preserved)


In [8]:
# ✅ Step 5: Split using RecursiveCharacterTextSplitter
# Separators are listed from coarsest (blank line) to finest (empty string).
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm against the installed version.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=512,
    chunk_overlap=50,
)

In [9]:
# Time the split with perf_counter(): it is monotonic and high-resolution, so
# the interval is reliable even for very fast runs. The elapsed time is later
# used as a divisor for the throughput metric, where a coarse wall clock
# (time.time()) could yield a zero difference.
split_start = time.perf_counter()
split_docs = splitter.split_documents(docs)
split_end = time.perf_counter()

In [10]:
# Raw text of each chunk, in the order the splitter produced them.
chunks = [piece.page_content for piece in split_docs]

# ✅ Step 6: Splitter Performance Metrics
# Per-chunk sizes: characters via len(), tokens via count_tokens().
chunk_lengths = list(map(len, chunks))
chunk_tokens = list(map(count_tokens, chunks))

In [11]:
def chunk_size_cv(lengths):
    """Return the coefficient of variation (std / mean) of ``lengths``.

    Args:
        lengths: Sequence of numeric chunk sizes.

    Returns:
        The standard deviation (``np.std`` with its default settings) divided
        by the mean, rounded to four decimal places; ``0`` when ``lengths`` is
        empty or the mean is not positive.
    """
    sizes = np.asarray(lengths)
    # np.mean / np.std on an empty input emit RuntimeWarnings and produce NaN;
    # short-circuit so the "no chunks" case returns 0 quietly.
    if sizes.size == 0:
        return 0
    mean = np.mean(sizes)
    std = np.std(sizes)
    # A zero (or negative) mean would make the ratio undefined or misleading.
    return round(std / mean, 4) if mean > 0 else 0

In [12]:
# Two inputs can crash this report: an empty chunk list (min()/max() raise
# ValueError) and a zero elapsed time (ZeroDivisionError). Both are guarded
# below and reported as 0 / "n/a" instead.
split_elapsed = split_end - split_start
# Size is approximated from the character count, i.e. one character is
# treated as one byte.
text_size_mb = len(text) / 1024 / 1024

split_metrics = {
    "Total Chunks": len(chunks),
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2) if chunk_lengths else 0,
    "Chunk Size CV": chunk_size_cv(chunk_lengths) if chunk_lengths else 0,
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}" if chunk_tokens else "n/a",
    "Processing Speed (MB/s)": round(text_size_mb / split_elapsed, 4) if split_elapsed > 0 else "n/a",
    # The two entries below are fixed qualitative labels, not measured values.
    "Memory Efficiency": "✔️ Efficient for large-scale batch",
    "Metadata Accuracy": "❌ No explicit section headers"
}

print("\n📊 Splitter Performance Metrics (RecursiveCharacterTextSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (RecursiveCharacterTextSplitter):
Total Chunks: 3
Avg Chunk Size (chars): 434.67
Chunk Size CV: 0.1098
Token Range: 82 - 96
Processing Speed (MB/s): 1.6252
Memory Efficiency: ✔️ Efficient for large-scale batch
Metadata Accuracy: ❌ No explicit section headers
