In [6]:
# 📘 Notebook: UnstructuredExcelLoader with CharacterTextSplitter
# 📄 Goal: Load a CSV file as unstructured content and split it using simple character-based logic

# Step 1: Install dependencies
!pip install -q langchain sentence-transformers transformers tiktoken psutil langchain_community langchain_experimental jq unstructured[all-docs]

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/981.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[

In [3]:
# Step 2: Import necessary modules
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter
import os, time, psutil, re, numpy as np
import tiktoken

In [4]:
# Step 3: Utility functions for evaluating loader and splitter performance

def count_tokens(text):
    """Return how many tokens ``text`` encodes to under the ``cl100k_base`` encoding."""
    encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)

def content_to_noise(text):
    r"""Return the fraction of characters in ``text`` that match ``\w``.

    The result is rounded to 4 decimal places. An empty ``text`` yields ``0``
    instead of dividing by zero.
    """
    word_char_count = sum(1 for _ in re.finditer(r'\w', text))
    length = len(text)
    if not length:
        return 0
    return round(word_char_count / length, 4)

def chunk_size_cv(lengths):
    """Coefficient of variation (population std / mean) of the chunk sizes.

    Args:
        lengths: Sequence of numeric chunk sizes.

    Returns:
        ``std / mean`` rounded to 4 decimal places as a plain ``float``, or ``0``
        when ``lengths`` is empty or the mean is not positive.
    """
    sizes = np.asarray(lengths, dtype=float)
    # np.mean / np.std on an empty array emit RuntimeWarnings and produce NaN;
    # return the "no variation" value directly instead.
    if sizes.size == 0:
        return 0
    mean = sizes.mean()
    # `not mean > 0` also covers a NaN mean, which compares False against 0.
    if not mean > 0:
        return 0
    return round(float(sizes.std() / mean), 4)

In [8]:
# Step 4: Load the CSV file
# Absolute path to the input file; edit it to point at your own CSV.
file_path = "/content/ENB2012_data.csv"

# CSVLoader is used instead of UnstructuredExcelLoader because the input is a CSV file.
# The import happens before the measurements start so that the one-off module import
# cost is not attributed to loading the data.
from langchain_community.document_loaders.csv_loader import CSVLoader

process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024  # resident set size in MB before loading

# perf_counter is a monotonic, high-resolution clock intended for measuring short intervals.
start_time = time.perf_counter()
loader = CSVLoader(file_path)
docs = loader.load()
end_time = time.perf_counter()

final_mem = process.memory_info().rss / 1024 / 1024  # resident set size in MB after loading

In [9]:
# Step 5: Calculate Loader Performance Metrics
# All loaded documents are concatenated into one newline-delimited string; every
# character- and token-level figure below is computed on that combined text.
text = "\n".join(doc.page_content for doc in docs)

load_seconds = end_time - start_time
rss_growth_mb = final_mem - initial_mem
word_chars = re.findall(r'\w', text)

loader_metrics = {
    "Total Character Count": len(text),
    "Alphanumeric Character Count": len(word_chars),
    "Newline Character Count": text.count("\n"),
    "Token Count (GPT-4 Encoding)": count_tokens(text),
    "Content-to-Noise Ratio": content_to_noise(text),
    "Processing Time (sec)": round(load_seconds, 2),
    "Memory Usage (MB)": round(rss_growth_mb, 2),
    # Fixed qualitative label; it is not derived from the loaded data.
    "Structural Preservation": "⚠️ Partial – tables may be flattened"
}

# NOTE(review): the header below names UnstructuredExcelLoader, but Step 4 builds
# `docs` with CSVLoader — confirm which label is intended.
print("🔍 Loader Performance Metrics (UnstructuredExcelLoader):")
print("\n".join(f"{name}: {value}" for name, value in loader_metrics.items()))

🔍 Loader Performance Metrics (UnstructuredExcelLoader):
Total Character Count: 70635
Alphanumeric Character Count: 41452
Newline Character Count: 7679
Token Count (GPT-4 Encoding): 58367
Content-to-Noise Ratio: 0.5868
Processing Time (sec): 0.01
Memory Usage (MB): 0.0
Structural Preservation: ⚠️ Partial – tables may be flattened


In [10]:
# Step 6: Split using CharacterTextSplitter
# Splits on newlines, targeting chunks of at most 500 characters with a 50-character overlap.
# NOTE(review): the recorded output shows 768 chunks averaging ~91 characters, far below
# chunk_size=500 — presumably every loaded document is already a single short row, so the
# splitter passes them through unchanged. Confirm this is the intended experiment.
splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)

# perf_counter is a monotonic, high-resolution clock; time.time() can be too coarse
# for an operation this short and may even report a zero-length interval.
split_start = time.perf_counter()
split_docs = splitter.split_documents(docs)
split_end = time.perf_counter()

# Per-chunk text, character length and token count, consumed by the metrics cell.
chunks = [doc.page_content for doc in split_docs]
chunk_lengths = [len(chunk) for chunk in chunks]
chunk_tokens = [count_tokens(chunk) for chunk in chunks]

In [11]:
# Step 7: Calculate Splitter Performance Metrics
split_elapsed = split_end - split_start
# Size of the combined text in "MB"; the character count stands in for the byte count.
text_mb = len(text) / 1024 / 1024

split_metrics = {
    "Total Chunks": len(chunks),
    # Guard the empty case: np.mean([]) warns and yields NaN.
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2) if chunk_lengths else 0,
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    # min()/max() raise ValueError on an empty sequence.
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}" if chunk_tokens else "n/a",
    # A coarse timer can report a zero-length interval; avoid ZeroDivisionError.
    "Processing Speed (MB/s)": round(text_mb / split_elapsed, 4) if split_elapsed > 0 else "n/a",
    # The two entries below are fixed qualitative labels, not measured values.
    "Memory Efficiency": "✅ High",
    "Metadata Accuracy": "❌ N/A (basic character splitter)"
}

print("\n📊 Splitter Performance Metrics (CharacterTextSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (CharacterTextSplitter):
Total Chunks: 768
Avg Chunk Size (chars): 90.97
Chunk Size CV: 0.0018
Token Range: 75 - 75
Processing Speed (MB/s): 3.5782
Memory Efficiency: ✅ High
Metadata Accuracy: ❌ N/A (basic character splitter)


In [12]:
# Summary of the experiment as a tab-separated table. This is a bare string
# expression, so the notebook echoes its repr as the cell output.
# NOTE(review): the "Tool Used" and "Strength" rows describe UnstructuredExcelLoader,
# but Step 4 loads the file with CSVLoader — confirm which loader this table should name.
"""Summary Table
Component	Loader	Splitter
Tool Used	UnstructuredExcelLoader	CharacterTextSplitter
Format	.csv	Char-based chunks
Strength	Handles unstructured Excel	Simple & Fast
Structure	⚠️ Flattened table rows	❌ No semantics
Evaluation	Token, char, C:N ratio	Chunk CV, speed"""

'Summary Table\nComponent\tLoader\tSplitter\nTool Used\tUnstructuredExcelLoader\tCharacterTextSplitter\nFormat\t.csv\tChar-based chunks\nStrength\tHandles unstructured Excel\tSimple & Fast\nStructure\t⚠️ Flattened table rows\t❌ No semantics\nEvaluation\tToken, char, C:N ratio\tChunk CV, speed'