In [4]:
# Notebook: UnstructuredPowerPointLoader with CharacterTextSplitter
# Goal: Load a PowerPoint (.ppt) file and split its extracted text using simple character-based rules

# Step 1: Install required packages
!pip install -q langchain unstructured python-pptx tiktoken psutil langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.5 MB[0m [31m16.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m43.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/438.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Step 2: Import dependencies
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain.text_splitter import CharacterTextSplitter
import time, os, psutil, re, numpy as np
import tiktoken

In [6]:
# Step 3: Define utility functions for performance metrics
def count_tokens(text):
    """Return how many tokens ``text`` becomes under the ``cl100k_base`` encoding."""
    encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)

def content_to_noise(text):
    r"""Return the fraction of characters in ``text`` that match ``\w``.

    ``\w`` covers letters, digits and the underscore, so underscores count as
    content here. The ratio is rounded to four decimal places; an empty string
    yields ``0``.
    """
    # Count matches lazily instead of materialising the list of characters.
    word_chars = sum(1 for _ in re.finditer(r'\w', text))
    length = len(text)
    if length == 0:
        return 0
    return round(word_chars / length, 4)

def chunk_size_cv(lengths):
    """Return the coefficient of variation (std / mean) of ``lengths``.

    Args:
        lengths: Sequence of chunk lengths (numbers).

    Returns:
        ``std / mean`` rounded to four decimal places, using the population
        standard deviation (``np.std`` default). Returns ``0`` when
        ``lengths`` is empty or its mean is not positive.
    """
    # np.mean / np.std on an empty sequence emit RuntimeWarnings and produce
    # NaN; short-circuit so an empty chunk list is handled quietly.
    if np.size(lengths) == 0:
        return 0
    mean = np.mean(lengths)
    if not mean > 0:
        return 0
    return round(np.std(lengths) / mean, 4)

In [7]:
# Step 4: Load the PowerPoint (.pptx) file
file_path = "/content/kafka_session_updated.pptx"  # Absolute path; the file must already exist at this location

# Fail early with a clear message instead of a loader-internal error.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"PowerPoint file not found: {file_path}")

# Resident memory of this kernel process (in MB) before loading.
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024

# perf_counter is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
loader = UnstructuredPowerPointLoader(file_path)
docs = loader.load()
end_time = time.perf_counter()

# Resident memory after loading; the difference is reported as memory usage.
final_mem = process.memory_info().rss / 1024 / 1024

In [8]:
# Step 5: Compute loader performance metrics
# Concatenate the text of every loaded document, one document per line group.
text = "\n".join(doc.page_content for doc in docs)

# Derived values are computed up front so the metrics table below stays flat.
word_char_total = len(re.findall(r'\w', text))
load_seconds = end_time - start_time
rss_delta_mb = final_mem - initial_mem

loader_metrics = dict([
    ("Total Character Count", len(text)),
    ("Alphanumeric Character Count", word_char_total),
    ("Newline Character Count", text.count("\n")),
    ("Token Count (GPT-4 encoding)", count_tokens(text)),
    ("Content-to-Noise Ratio", content_to_noise(text)),
    ("Processing Time (sec)", round(load_seconds, 2)),
    ("Memory Usage (MB)", round(rss_delta_mb, 2)),
    # Hard-coded qualitative note, not a measured value.
    ("Structural Preservation", "⚠️ Moderate – slide separation not guaranteed"),
])

print("🔍 Loader Performance Metrics (UnstructuredPowerPointLoader):")
for label, value in loader_metrics.items():
    print(f"{label}: {value}")

🔍 Loader Performance Metrics (UnstructuredPowerPointLoader):
Total Character Count: 13209
Alphanumeric Character Count: 10490
Newline Character Count: 498
Token Count (GPT-4 encoding): 2690
Content-to-Noise Ratio: 0.7942
Processing Time (sec): 5.71
Memory Usage (MB): 327.45
Structural Preservation: ⚠️ Moderate – slide separation not guaranteed


In [9]:
# Step 6: Apply CharacterTextSplitter
# Split on newlines with a chunk size of 800 and an overlap of 50 between
# neighbouring chunks (sizes are measured by the splitter's length function).
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=50
)

# perf_counter has much finer resolution than time.time(), so a very fast
# split does not yield a zero elapsed time (which would break the MB/s metric).
split_start = time.perf_counter()
split_docs = splitter.split_documents(docs)
split_end = time.perf_counter()

In [10]:
# Per-chunk text, character length and token count, kept as parallel lists.
chunks = list(map(lambda split_doc: split_doc.page_content, split_docs))
chunk_lengths = list(map(len, chunks))
chunk_tokens = list(map(count_tokens, chunks))

In [11]:
# Summarise the splitter output. The guards below keep this cell from failing
# when the splitter produced no chunks (min/max of an empty list raise
# ValueError) or when the measured elapsed time is zero (division by zero).
split_elapsed = split_end - split_start
text_size_mb = len(text) / 1024 / 1024  # character count used as a stand-in for bytes

split_metrics = {
    "Total Chunks": len(chunks),
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2) if chunk_lengths else 0,
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}" if chunk_tokens else "N/A",
    "Processing Speed (MB/s)": round(text_size_mb / split_elapsed, 4) if split_elapsed > 0 else "N/A",
    # The two entries below are fixed qualitative labels, not measurements.
    "Memory Efficiency": "✅ High",
    "Metadata Accuracy": "❌ None (no metadata tagging)"
}

print("\n📊 Splitter Performance Metrics (CharacterTextSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (CharacterTextSplitter):
Total Chunks: 20
Avg Chunk Size (chars): 667.7
Chunk Size CV: 0.2112
Token Range: 41 - 212
Processing Speed (MB/s): 12.1267
Memory Efficiency: ✅ High
Metadata Accuracy: ❌ None (no metadata tagging)
