In [5]:
# 📘 Notebook: JSONLoader with RecursiveJsonSplitter
# 📄 Goal: Load a structured JSON file and split it intelligently by nested fields

# Step 1: Install dependencies
!pip install -q langchain sentence-transformers transformers tiktoken psutil langchain_community langchain_experimental jq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/746.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m553.0/746.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m746.6/746.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Step 2: Import required modules
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveJsonSplitter
import time, os, psutil, json, numpy as np, re
import tiktoken

In [3]:
# Step 3: Utility functions for performance evaluation
def count_tokens(text):
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def content_to_noise(text):
    alphanum = len(re.findall(r'\w', text))
    total_chars = len(text)
    return round(alphanum / total_chars, 4) if total_chars else 0

def chunk_size_cv(lengths):
    mean = np.mean(lengths)
    std = np.std(lengths)
    return round(std / mean, 4) if mean > 0 else 0

In [6]:
# Step 4: Load the JSON file
file_path = "/content/ApiCallTypes.json"  # Ensure this file is in the working directory

start_time = time.time()
process = psutil.Process(os.getpid())
initial_mem = process.memory_info().rss / 1024 / 1024

loader = JSONLoader(file_path=file_path, jq_schema='.', text_content=False)
docs = loader.load()

end_time = time.time()
final_mem = process.memory_info().rss / 1024 / 1024

In [8]:
# Step 5: Loader Metrics
text = "\n".join([doc.page_content for doc in docs])

loader_metrics = {
    "Total Character Count": len(text),
    "Alphanumeric Character Count": len(re.findall(r'\w', text)),
    "Newline Character Count": text.count("\n"),
    "Token Count (GPT-4 encoding)": count_tokens(text),
    "Content-to-Noise Ratio": content_to_noise(text),
    "Processing Time (sec)": round(end_time - start_time, 2),
    "Memory Usage (MB)": round(final_mem - initial_mem, 2),
    "Structural Preservation": "High (via JSON schema)"
}

print("🔍 Loader Performance Metrics (JSONLoader):")
for k, v in loader_metrics.items():
    print(f"{k}: {v}")

🔍 Loader Performance Metrics (JSONLoader):
Total Character Count: 2769
Alphanumeric Character Count: 2071
Newline Character Count: 0
Token Count (GPT-4 encoding): 643
Content-to-Noise Ratio: 0.7479
Processing Time (sec): 0.02
Memory Usage (MB): 0.0
Structural Preservation: High (via JSON schema)


In [14]:
# Step 6: Apply RecursiveJsonSplitter
# splitter = RecursiveJsonSplitter(max_chunk_size=1000, max_depth=10)
with open('/content/ApiCallTypes.json', "r") as f:
    json_data = json.load(f)  # Load raw structured JSON

splitter = RecursiveJsonSplitter(max_chunk_size=1000)
split_start = time.time()
chunks = splitter.split_json(json_data=json_data)
split_end = time.time()

chunk_texts = [json.dumps(chunk) for chunk in chunks]  # Convert dicts to strings
chunk_lengths = [len(text) for text in chunk_texts]
chunk_tokens = [count_tokens(text) for text in chunk_texts]


In [15]:
split_metrics = {
    "Total Chunks": len(chunks),
    "Avg Chunk Size (chars)": round(np.mean(chunk_lengths), 2),
    "Chunk Size CV": chunk_size_cv(chunk_lengths),
    "Token Range": f"{min(chunk_tokens)} - {max(chunk_tokens)}",
    "Processing Speed (MB/s)": round((len(text)/1024/1024) / (split_end - split_start), 4),
    "Memory Efficiency": "High",
    "Metadata Accuracy": "Preserved nested context"
}

print("\n📊 Splitter Performance Metrics (RecursiveJsonSplitter):")
for k, v in split_metrics.items():
    print(f"{k}: {v}")


📊 Splitter Performance Metrics (RecursiveJsonSplitter):
Total Chunks: 1
Avg Chunk Size (chars): 2769.0
Chunk Size CV: 0.0
Token Range: 643 - 643
Processing Speed (MB/s): 10.6092
Memory Efficiency: High
Metadata Accuracy: Preserved nested context


In [16]:
"""Summary
Component	Loader	Splitter
Tool Used	JSONLoader	RecursiveJsonSplitter
Format	Structured JSON	Hierarchical key splits
Best For	API responses, MongoDB exports	Nested logical segmentation
Strengths	Field preservation	Semantic hierarchy
Eval Metrics	Token count, C:N ratio, memory	Chunk CV, speed, structure"""

'Summary\nComponent\tLoader\tSplitter\nTool Used\tJSONLoader\tRecursiveJsonSplitter\nFormat\tStructured JSON\tHierarchical key splits\nBest For\tAPI responses, MongoDB exports\tNested logical segmentation\nStrengths\tField preservation\tSemantic hierarchy\nEval Metrics\tToken count, C:N ratio, memory\tChunk CV, speed, structure'