In [7]:
# Notebook: TextLoader with SemanticChunker
# Objective: Load a plain text file and semantically split it using SentenceTransformers-based chunking

# Step 1: Install dependencies
!pip install -q langchain sentence-transformers transformers tiktoken psutil langchain_community langchain_experimental

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
# Step 2: Import required libraries
from langchain_community.document_loaders import TextLoader
# Import SemanticChunker from langchain_experimental
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
import os, time, psutil, re, numpy as np
import tiktoken

In [9]:
# Step 3: Define utility functions for performance metrics
def count_tokens(text):
    """Return how many tokens ``text`` encodes to under tiktoken's ``cl100k_base`` encoding."""
    token_ids = tiktoken.get_encoding("cl100k_base").encode(text)
    return len(token_ids)

def content_to_noise(text):
    """Return the share of characters in ``text`` that are regex word characters.

    The count uses the regex word-character class, so underscores are counted
    alongside letters and digits. The ratio is rounded to 4 decimal places;
    an empty ``text`` yields 0.
    """
    word_char_total = sum(1 for _ in re.finditer(r'\w', text))
    length = len(text)
    if not length:
        return 0
    return round(word_char_total / length, 4)

def chunk_size_cv(lengths):
    """Return the coefficient of variation (std / mean) of ``lengths``, rounded to 4 places.

    Args:
        lengths: A sized sequence of numeric chunk lengths.

    Returns:
        ``std / mean`` rounded to 4 decimal places, or 0 when ``lengths`` is
        empty or its mean is not positive.
    """
    # Guard the empty case explicitly: np.mean/np.std on an empty sequence
    # return NaN and emit RuntimeWarnings before the ``mean > 0`` check runs.
    if len(lengths) == 0:
        return 0
    mean = np.mean(lengths)
    std = np.std(lengths)
    return round(std / mean, 4) if mean > 0 else 0

In [10]:
# Step 4: Load local text file using TextLoader
txt_path = "/content/UseCaseAPICall.txt"  # Ensure this file exists at this path in the runtime's filesystem

# perf_counter() is a high-resolution clock intended for timing short intervals;
# time.time() is wall-clock time and too coarse for a load this fast.
start_time = time.perf_counter()
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024  # resident set size, bytes -> MB

loader = TextLoader(txt_path)
docs = loader.load()

end_time = time.perf_counter()
final_mem = process.memory_info().rss / 1024 / 1024

# Step 5: Evaluate Loader performance
# All loaded documents are concatenated so the metrics describe the whole input.
text = "\n".join(doc.page_content for doc in docs)

loader_metrics = {
    "Total Character Count": len(text),
    "Alphanumeric Character Count": len(re.findall(r'\w', text)),
    "Newline Character Count": text.count("\n"),
    "Token Count (GPT-4 Encoding)": count_tokens(text),
    "Content-to-Noise Ratio": content_to_noise(text),
    # Four decimals: a small file loads in well under 10 ms, which rounded to 0.0 at two.
    "Processing Time (sec)": round(end_time - start_time, 4),
    # Difference in process RSS before/after the load.
    "Memory Usage (MB)": round(final_mem - initial_mem, 2),
    "Structural Preservation": "❌ No structure preserved in raw .txt"
}

print("🔍 Loader Performance Metrics (TextLoader):")
for k, v in loader_metrics.items():
    print(f"{k}: {v}")

🔍 Loader Performance Metrics (TextLoader):
Total Character Count: 2702
Alphanumeric Character Count: 2070
Newline Character Count: 118
Token Count (GPT-4 Encoding): 642
Content-to-Noise Ratio: 0.7661
Processing Time (sec): 0.0
Memory Usage (MB): 0.0
Structural Preservation: ❌ No structure preserved in raw .txt


In [11]:
# Step 6: Build the SBERT embedding model and pass it to SemanticChunker,
# which is used below to split the loaded documents.
SBERT_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=SBERT_MODEL_NAME)
splitter = SemanticChunker(embeddings=embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Time only the split itself. perf_counter() is the high-resolution clock
# intended for measuring elapsed intervals.
split_start = time.perf_counter()
split_docs = splitter.split_documents(docs)
split_end = time.perf_counter()
split_elapsed = split_end - split_start

chunks = [doc.page_content for doc in split_docs]
chunk_lengths = [len(chunk) for chunk in chunks]
chunk_tokens = [count_tokens(chunk) for chunk in chunks]

# Throughput is reported in MB/s, so size the input in UTF-8 bytes rather than
# characters (the two differ as soon as the text contains non-ASCII characters).
input_mb = len(text.encode("utf-8")) / 1024 / 1024

split_metrics = {
    "Total Chunks": len(chunks),
    # Guard the empty case: np.mean([]) is NaN and min()/max() raise on an empty list.
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2) if chunks else 0,
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}" if chunks else "N/A",
    # Avoid ZeroDivisionError if the timer reports no elapsed time.
    "Processing Speed (MB/s)": round(input_mb / split_elapsed, 4) if split_elapsed > 0 else float("inf"),
    # The two entries below are fixed descriptive labels, not measured values.
    "Memory Efficiency": "✔️ Acceptable with GPU/efficient embedding model",
    "Metadata Accuracy": "N/A (pure semantic chunks)"
}

print("\n📊 Splitter Performance Metrics (SemanticChunker):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (SemanticChunker):
Total Chunks: 3
Avg Chunk Size (chars): 894.33
Chunk Size CV: 0.3292
Token Range: 109 - 273
Processing Speed (MB/s): 0.0007
Memory Efficiency: ✔️ Acceptable with GPU/efficient embedding model
Metadata Accuracy: N/A (pure semantic chunks)


In [13]:
# Step 7: Optional Semantic Flow Coherence Evaluation
# Load the cross-encoder that calculate_semantic_flow() uses to score chunk pairs.
from sentence_transformers import CrossEncoder

CROSS_ENCODER_NAME = 'cross-encoder/stsb-roberta-base'
model = CrossEncoder(CROSS_ENCODER_NAME)

def calculate_semantic_flow(chunks, scorer=None, max_pairs=20):
    """Return the mean cross-encoder score over adjacent chunk pairs.

    Args:
        chunks: Sequence of chunk strings in document order.
        scorer: Object with a ``predict(pairs)`` method that returns one score
            per ``(text_a, text_b)`` pair. Defaults to the module-level ``model``.
        max_pairs: Upper bound on how many leading adjacent pairs are scored.

    Returns:
        The mean score as a built-in ``float`` rounded to 4 decimal places, or
        ``nan`` when there are fewer than two chunks (no pairs to score).
    """
    if scorer is None:
        scorer = model  # module-level CrossEncoder, looked up at call time

    pair_count = min(len(chunks) - 1, max_pairs)
    if pair_count <= 0:
        # No adjacent pairs; return NaN directly instead of calling
        # np.mean([]), which yields the same value but emits a RuntimeWarning.
        return float("nan")

    # Score all pairs in one batched predict() call rather than one call per pair.
    pairs = [(chunks[i], chunks[i + 1]) for i in range(pair_count)]
    scores = scorer.predict(pairs)

    # Convert to a built-in float before rounding: round() on a NumPy float32
    # stays float32 and prints as e.g. 0.4383000135421753.
    return round(float(np.mean(scores)), 4)

# Score the coherence between adjacent chunks and report the result.
semantic_score = calculate_semantic_flow(chunks)
flow_report = f"\n🔗 Semantic Flow Score (first 20 chunks): {semantic_score}"
print(flow_report)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.14k [00:00<?, ?B/s]


🔗 Semantic Flow Score (first 20 chunks): 0.4383000135421753


In [None]:
"""Summary Table
Component	Loader	Splitter
Tool Used	TextLoader	SemanticChunker
Format	.txt	Meaning-based
Strength	Fast/simple	Preserves meaning
Structure	❌ Raw	✅ Semantic flow
Evaluation	Char, token, C:N	Chunk CV, Flow score"""