In [1]:
# Notebook: UnstructuredWordDocumentLoader with RecursiveCharacterTextSplitter
# Goal: Load a Word (.docx) document and split it into structure-aware chunks

# Step 1: Install dependencies
!pip install -q langchain unstructured python-docx tiktoken psutil langchain_community langchain_experimental

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m624.6/981.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m18.2

In [2]:
# Step 2: Import libraries
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time, os, psutil, re, numpy as np
import tiktoken

In [3]:
# Step 3: Utility functions for metrics

def count_tokens(text, encoding_name="cl100k_base"):
    """Return the number of tokens in ``text`` under a tiktoken encoding.

    Args:
        text: String to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``. Defaults to
            "cl100k_base", the encoding the notebook's metrics are reported in.

    Returns:
        int: Length of the token sequence produced by the encoder.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # Document text is arbitrary. By default tiktoken raises ValueError when the
    # input contains a special-token literal such as "<|endoftext|>"; disabling
    # that check makes such literals count as ordinary text instead of crashing.
    return len(enc.encode(text, disallowed_special=()))

def content_to_noise(text):
    """Return the fraction of characters in ``text`` that are regex word characters.

    Word characters are whatever the ``\\w`` class matches (letters, digits and
    underscore). The ratio is rounded to 4 decimal places; empty input yields 0
    instead of dividing by zero.
    """
    word_char_count = len(re.findall(r'\w', text))
    length = len(text)
    if not length:
        return 0
    return round(word_char_count / length, 4)

def chunk_size_cv(lengths):
    """Return the coefficient of variation (std / mean) of ``lengths``.

    Args:
        lengths: Sequence of chunk sizes.

    Returns:
        The ratio rounded to 4 decimal places, or 0 when ``lengths`` is empty
        or its mean is not positive.
    """
    # Guard first: np.mean/np.std on an empty sequence emit RuntimeWarnings and
    # produce NaN. The result for that case is 0 either way.
    if len(lengths) == 0:
        return 0
    mean = np.mean(lengths)
    # `not mean > 0` (rather than `mean <= 0`) also routes NaN to the 0 branch.
    if not mean > 0:
        return 0
    return round(np.std(lengths) / mean, 4)

In [4]:
# Step 4: Load the .docx file
file_path = "/content/HD-Titles.docx"  # Absolute path; the file must exist at this location

# Capture the process's resident memory (in MB) before loading.
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system clock
# is adjusted. The timer starts here so only the load itself is measured.
start_time = time.perf_counter()

loader = UnstructuredWordDocumentLoader(file_path)
docs = loader.load()

end_time = time.perf_counter()
final_mem = process.memory_info().rss / 1024 / 1024

In [5]:
# Step 5: Loader performance metrics
# All loaded documents are concatenated, newline-separated, into one string so
# the metrics describe the loader's full output.
text = "\n".join(doc.page_content for doc in docs)

# Built from (label, value) pairs; insertion order is the print order.
loader_metrics = dict([
    ("Total Character Count", len(text)),
    ("Alphanumeric Character Count", len(re.findall(r'\w', text))),
    ("Newline Character Count", text.count("\n")),
    ("Token Count (GPT-4 encoding)", count_tokens(text)),
    ("Content-to-Noise Ratio", content_to_noise(text)),
    ("Processing Time (sec)", round(end_time - start_time, 2)),
    ("Memory Usage (MB)", round(final_mem - initial_mem, 2)),
    ("Structural Preservation", "⚠️ Limited – some formatting may be lost"),
])

print("🔍 Loader Performance Metrics (UnstructuredWordDocumentLoader):")
for label, value in loader_metrics.items():
    print(f"{label}: {value}")

🔍 Loader Performance Metrics (UnstructuredWordDocumentLoader):
Total Character Count: 1706
Alphanumeric Character Count: 1406
Newline Character Count: 10
Token Count (GPT-4 encoding): 330
Content-to-Noise Ratio: 0.8242
Processing Time (sec): 5.03
Memory Usage (MB): 290.23
Structural Preservation: ⚠️ Limited – some formatting may be lost


In [6]:
# Step 6: Split the loaded content using RecursiveCharacterTextSplitter
# Separators are tried in the listed order: paragraph breaks, line breaks,
# spaces, and finally individual characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)

# Splitting a small document can finish faster than time.time() resolves, giving
# a zero elapsed time; perf_counter() is the high-resolution clock for intervals.
split_start = time.perf_counter()
split_docs = splitter.split_documents(docs)
split_end = time.perf_counter()

# Per-chunk text, character lengths and token counts feed the metrics cell below.
chunks = [doc.page_content for doc in split_docs]
chunk_lengths = [len(chunk) for chunk in chunks]
chunk_tokens = [count_tokens(chunk) for chunk in chunks]

In [7]:
# Step 7: Splitter performance metrics
split_elapsed = split_end - split_start

# Throughput is undefined when the measured interval is zero (the split finished
# below the timer's resolution); report "N/A" instead of dividing by zero.
# NOTE: len(text) counts characters, so this equals MB only for single-byte text.
if split_elapsed > 0:
    processing_speed = round((len(text) / 1024 / 1024) / split_elapsed, 4)
else:
    processing_speed = "N/A"

split_metrics = {
    "Total Chunks": len(chunks),
    # np.mean of an empty list is NaN (with a warning); report 0 when there are no chunks.
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2) if chunk_lengths else 0,
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    # min()/max() raise ValueError on an empty list.
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}" if chunk_tokens else "N/A",
    "Processing Speed (MB/s)": processing_speed,
    "Memory Efficiency": "✅ High",
    "Metadata Accuracy": "❌ N/A – no semantic metadata added"
}

print("\n📊 Splitter Performance Metrics (RecursiveCharacterTextSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (RecursiveCharacterTextSplitter):
Total Chunks: 3
Avg Chunk Size (chars): 567.33
Chunk Size CV: 0.2266
Token Range: 89 - 145
Processing Speed (MB/s): 4.0936
Memory Efficiency: ✅ High
Metadata Accuracy: ❌ N/A – no semantic metadata added


## Summary

| Component | Loader | Splitter |
|---|---|---|
| Tool Used | UnstructuredWordDocumentLoader | RecursiveCharacterTextSplitter |
| File Format | .docx | Generic paragraph-aware chunks |
| Structure | ⚠️ Limited formatting retained | ✅ Contextual text preservation |
| Metrics | Token count, C:N ratio, speed | Chunk CV, throughput |
| Ideal Use | Reports, prose, narratives | General-purpose splitting |