###📄 UnstructuredMarkdownLoader to load a local Markdown file (GITMkdwn.md)

###🪚 MarkdownHeaderTextSplitter to split based on Markdown headers

###📈 Evaluating loader and splitter with performance metrics as discussed earlier

| Component | Loader                                  | Splitter                           |
| --------- | --------------------------------------- | ---------------------------------- |
| Tool Used | `UnstructuredMarkdownLoader`            | `MarkdownHeaderTextSplitter`       |
| Strength  | Preserves Markdown structure            | Header-aware semantic segmentation |
| Metrics   | Character count, token cost, C\:N ratio | Chunk size, metadata, coherence    |
| Coherence | ✅ Optional scoring with STS-B           | ✅ Score output                     |


In [3]:
# 📘 Notebook: UnstructuredMarkdownLoader with MarkdownHeaderTextSplitter
# 🧠 Objective: Load a Markdown file and split it semantically using Markdown headers

# ✅ Step 1: Install required packages
!pip install -q langchain unstructured tiktoken psutil transformers langchain_community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m2.4/2.5 MB[0m [31m72.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# ✅ Step 2: Import necessary modules
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
import time, os, psutil, re
import numpy as np
import tiktoken

In [5]:
# ✅ Step 3: Define helper functions
def count_tokens(text):
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def content_to_noise(text):
    alphanum = len(re.findall(r'\w', text))
    total_chars = len(text)
    return round(alphanum / total_chars, 4) if total_chars > 0 else 0

def chunk_size_cv(lengths):
    mean = np.mean(lengths)
    std = np.std(lengths)
    return round(std / mean, 4) if mean > 0 else 0

In [7]:
# ✅ Step 4: Load local Markdown file using UnstructuredMarkdownLoader
markdown_path = "/content/GITMkdwn.md"  # Replace with actual file path

In [8]:

start_time = time.time()
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024

loader = UnstructuredMarkdownLoader(markdown_path)
docs = loader.load()

end_time = time.time()

In [9]:
final_mem = process.memory_info().rss / 1024 / 1024

# ✅ Step 5: Evaluate loader performance
text = "\n".join([doc.page_content for doc in docs])

In [10]:
loader_metrics = {
    "Total Character Count": len(text),
    "Alphanumeric Character Count": len(re.findall(r'\w', text)),
    "Newline Character Count": text.count("\n"),
    "Token Count (GPT-4 Encoding)": count_tokens(text),
    "Content-to-Noise Ratio": content_to_noise(text),
    "Processing Time (sec)": round(end_time - start_time, 2),
    "Memory Usage (MB)": round(final_mem - initial_mem, 2),
    "Structural Preservation": "✅ Markdown structure preserved"
}

In [11]:
print("🔍 Loader Performance Metrics (UnstructuredMarkdownLoader):")
for k, v in loader_metrics.items():
    print(f"{k}: {v}")

🔍 Loader Performance Metrics (UnstructuredMarkdownLoader):
Total Character Count: 1289
Alphanumeric Character Count: 1032
Newline Character Count: 48
Token Count (GPT-4 Encoding): 262
Content-to-Noise Ratio: 0.8006
Processing Time (sec): 7.05
Memory Usage (MB): 326.4
Structural Preservation: ✅ Markdown structure preserved


In [12]:
# ✅ Step 6: Split using MarkdownHeaderTextSplitter
# We'll specify the Markdown headers we expect in the document

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
])

In [13]:
split_start = time.time()
split_docs = splitter.split_text(text)
split_end = time.time()

In [14]:
chunks = [doc.page_content for doc in split_docs]
chunk_lengths = [len(chunk) for chunk in chunks]
chunk_tokens = [count_tokens(chunk) for chunk in chunks]

split_metrics = {
    "Total Chunks": len(chunks),
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2),
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}",
    "Processing Speed (MB/s)": round((len(text)/1024/1024) / (split_end - split_start), 4),
    "Memory Efficiency": "✔️ Efficient for structural text",
    "Metadata Accuracy": "✅ Based on Markdown header hierarchy"
}

In [16]:
print("\n📊 Splitter Performance Metrics (MarkdownHeaderTextSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (MarkdownHeaderTextSplitter):
Total Chunks: 1
Avg Chunk Size (chars): 1313.0
Chunk Size CV: 0.0
Token Range: 271 - 271
Processing Speed (MB/s): 4.9768
Memory Efficiency: ✔️ Efficient for structural text
Metadata Accuracy: ✅ Based on Markdown header hierarchy


In [17]:
# ✅ Step 7: Semantic Coherence Evaluation (Optional)
from sentence_transformers import CrossEncoder
model = CrossEncoder('cross-encoder/stsb-roberta-base')

def calculate_semantic_flow(chunks):
    scores = []
    for i in range(min(len(chunks)-1, 20)):
        score = model.predict([chunks[i], chunks[i+1]])
        scores.append(score)
    return round(np.mean(scores), 4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

In [18]:
semantic_score = calculate_semantic_flow(chunks)
print(f"\n🔗 Semantic Flow Score (first 20 chunks): {semantic_score}")


🔗 Semantic Flow Score (first 20 chunks): nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
