# Data Preparation Notebook

This notebook contains utilities for data preprocessing and preparation.

In [46]:
import requests
import os|
import json
import logging
from pathlib import Path
from typing import List

# Semantic chunking imports
try:
    from langchain_experimental.text_splitter import SemanticChunker as LangChainSemanticChunker
    from langchain_core.embeddings import Embeddings
    from sentence_transformers import SentenceTransformer
    LANGCHAIN_SEMANTIC_AVAILABLE = True
except ImportError:
    LANGCHAIN_SEMANTIC_AVAILABLE = False
    print("Warning: LangChain semantic chunker not available, using fallback chunking")

SyntaxError: invalid syntax (1345840862.py, line 2)

In [47]:
def fetch_text_from_url(url: str) -> str:
    """Fetch text content from a URL."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        raise requests.RequestException(f"Failed to fetch content from {url}: {e}")

In [48]:
def save_text_as_markdown(text: str, filename: str, output_dir: str = ".") -> str:
    """Save text content as a markdown file."""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    file_path = output_path / f"{filename}.md"
    
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)
    
    return str(file_path)

In [49]:
# Create data directory and fetch text from multiple URLs
urls = [
    "https://context7.com/langchain-ai/langgraph/llms.txt?tokens=600000",
    "https://context7.com/pydantic/pydantic-ai/llms.txt?tokens=100000",
    "https://context7.com/llmstxt/python_langchain_llms_txt/llms.txt?tokens=100000"
]

filenames = [
    "langgraph_llms_data",
    "pydantic_ai_llms_data", 
    "python_langchain_llms_data"
]

data_dir = "data"

# Create data directory
os.makedirs(data_dir, exist_ok=True)
print(f"Created/verified data directory: {data_dir}")

# Fetch content from all URLs
text_contents = []
for i, url in enumerate(urls):
    print(f"\nFetching content from: {url}")
    text_content = fetch_text_from_url(url)
    text_contents.append(text_content)
    
    print(f"Content length: {len(text_content)} characters")
    print(f"First 100 characters: {text_content[:100]}...")

Created/verified data directory: data

Fetching content from: https://context7.com/langchain-ai/langgraph/llms.txt?tokens=600000
Content length: 460281 characters
CODE SNIPPETS
TITLE: Example LangGraph Project Dir...

Fetching content from: https://context7.com/pydantic/pydantic-ai/llms.txt?tokens=100000
Content length: 467984 characters
CODE SNIPPETS
TITLE: Run Pydantic AI Example with ...

Fetching content from: https://context7.com/llmstxt/python_langchain_llms_txt/llms.txt?tokens=100000
Content length: 476392 characters
CODE SNIPPETS
TITLE: LangChain Introduction and Tu...


In [50]:
# Save all content as markdown files in the data directory
for i, (text_content, filename) in enumerate(zip(text_contents, filenames)):
    saved_file = save_text_as_markdown(text_content, filename, data_dir)
    print(f"Content {i+1} saved to: {saved_file}")
    print(f"File size: {os.path.getsize(saved_file)} bytes")
    print()

Content 1 saved to: data\langgraph_llms_data.md
File size: 474337 bytes

Content 2 saved to: data\pydantic_ai_llms_data.md
File size: 483402 bytes

Content 3 saved to: data\python_langchain_llms_data.md
File size: 489407 bytes



In [51]:
from sentence_transformers import SentenceTransformer
class SentenceTransformerEmbeddings(Embeddings):
    """Wrapper for SentenceTransformer models to work with LangChain."""
    
    def __init__(self, model_name: str = 'nomic-ai/nomic-embed-text-v1.5'):
        print(f"Initializing SentenceTransformer with model: {model_name}")
        try:
            self.model = SentenceTransformer(model_name, trust_remote_code=True)
            print(f"Successfully loaded model: {model_name}")
        except Exception as e:
            print(f"Failed to load {model_name}, falling back to BAAI/bge-large-en-v1.5: {e}")
            self.model = SentenceTransformer('BAAI/bge-large-en-v1.5')
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs."""
        print(f"Embedding {len(texts)} documents")
        embeddings = self.model.encode(texts)
        return embeddings.tolist()
    
    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        print(f"Embedding query of length: {len(text)}")
        embedding = self.model.encode([text])
        return embedding[0].tolist()

class SemanticChunker:
    """Semantic chunker using LangChain's SemanticChunker and Nomic model."""

    def __init__(self, chunk_size: int = 8192, threshold: float = 0.75):
        self.chunk_size = chunk_size
        self.threshold = threshold

        # Setup logging
        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)
        
        print(f"Initializing SemanticChunker with chunk_size={chunk_size}, threshold={threshold}")
        
        # Check if semantic chunking is available
        try:
            from langchain_experimental.text_splitter import SemanticChunker as LangChainSemanticChunker
            semantic_available = True
            print("LangChain SemanticChunker is available")
        except ImportError as e:
            semantic_available = False
            print(f"LangChain SemanticChunker not available: {e}")
        
        print(f"LANGCHAIN_SEMANTIC_AVAILABLE: {semantic_available}")
        
        # Initialize semantic chunker
        try:
            if semantic_available:
                print("Creating SentenceTransformer embeddings...")
                # Create embeddings wrapper
                embeddings = SentenceTransformerEmbeddings('nomic-ai/nomic-embed-text-v1.5')
                
                # Calculate min_chunk_size based on chunk_size
                min_chunk_size = max(500, self.chunk_size // 4)
                print(f"Using min_chunk_size: {min_chunk_size}")
                
                # Pass the embeddings wrapper into SemanticChunker
                print("Creating LangChain SemanticChunker...")
                self.semantic_chunker = LangChainSemanticChunker(
                    embeddings=embeddings, 
                    breakpoint_threshold_amount=self.threshold,
                    min_chunk_size=min_chunk_size
                )
                print("LangChain semantic chunker initialized successfully!")
                self.logger.info(f"LangChain semantic chunker ready with Nomic embeddings (chunk_size={chunk_size}, min_chunk_size={min_chunk_size})")
            else:
                print("LangChain experimental not available")
                self.semantic_chunker = None
                self.logger.warning("langchain_experimental not available, using fallback chunking")
        except Exception as e:
            print(f"ERROR initializing semantic chunker: {e}")
            print(f"Exception type: {type(e)}")
            import traceback
            print(f"Full traceback:")
            traceback.print_exc()
            self.logger.warning(f"Semantic chunker failed to initialize, using fallback: {e}")
            self.semantic_chunker = None
    
    def chunk_text(self, text: str, source_file: str) -> List[str]:
        """Split text into semantic chunks."""
        print(f"\n=== Chunking {source_file} ===")
        print(f"Text length: {len(text)} characters")
        print(f"Semantic chunker available: {self.semantic_chunker is not None}")
        
        if self.semantic_chunker:
            try:
                print("Attempting semantic chunking...")
                # Create semantic chunks using LangChain's method
                docs = self.semantic_chunker.create_documents([text])
                chunks = [d.page_content for d in docs]
                
                print(f"Semantic chunking completed. Created {len(chunks)} chunks")
                
                if not chunks:
                    print("WARNING: Semantic chunker produced no chunks. Using fallback.")
                    self.logger.warning(f"Semantic chunker produced no chunks for {source_file}. Using fallback.")
                    return self._fallback_chunker(text)

                # Log chunk length statistics
                chunk_lengths = [len(chunk) for chunk in chunks]
                print(f"Chunk lengths: min={min(chunk_lengths)}, max={max(chunk_lengths)}, avg={sum(chunk_lengths)/len(chunk_lengths):.0f}")
                
                self.logger.info(f"Created {len(chunks)} semantic chunks for {source_file}")
                return chunks
            except Exception as e:
                print(f"ERROR during semantic chunking: {e}")
                print(f"Exception type: {type(e)}")
                import traceback
                print(f"Full traceback:")
                traceback.print_exc()
                self.logger.error(f"Semantic chunking failed during execution for {source_file}: {e}. Using fallback.")
                return self._fallback_chunker(text)
        else:
            print("Using fallback chunker (semantic chunker not available)")
            self.logger.info(f"Using fallback chunker for {source_file}.")
            return self._fallback_chunker(text)

    def _fallback_chunker(self, text: str) -> List[str]:
        """Fallback text chunking method."""
        print(f"=== FALLBACK CHUNKING ===")
        chunk_size = self.chunk_size
        overlap = 500
        chunks = []
        
        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if chunk.strip():
                chunks.append(chunk)
        
        print(f"Fallback chunking created {len(chunks)} chunks")
        return chunks

In [52]:
def save_chunks_as_json(chunks: List[str], source_name: str, output_dir: str = "data/chunks") -> str:
    """
    Save text chunks as JSON with source metadata.
    
    Args:
        chunks: List of text chunks
        source_name: Name of the source file
        output_dir: Directory to save the file in
        
    Returns:
        str: Full path to the saved file
    """
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # Create metadata for each chunk
    chunk_data = []
    for i, chunk in enumerate(chunks):
        chunk_data.append({
            "chunk_id": i + 1,
            "source": source_name,
            "content": chunk,
            "chunk_length": len(chunk)
        })
    
    # Save as JSON
    json_filename = f"{source_name}_chunks.json"
    file_path = output_path / json_filename
    
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(chunk_data, f, indent=2, ensure_ascii=False)
    
    return str(file_path)

In [None]:

import logging
import json
# Initialize semantic chunker
chunker = SemanticChunker(chunk_size=8192, threshold=0.75)

# Define the markdown files to process
markdown_files = [
    ("data/langgraph_llms_data.md", "langgraph"),
    ("data/pydantic_ai_llms_data.md", "pydantic_ai"),
    ("data/python_langchain_llms_data.md", "python_langchain")
]

print("Starting semantic chunking process...")

for file_path, source_name in markdown_files:
    print(f"\nProcessing {file_path}...")
    
    try:
        # Read the markdown file
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
        
        print(f"File length: {len(text_content)} characters")
        
        # Create semantic chunks
        chunks = chunker.chunk_text(text_content, source_name)
        
        print(f"Created {len(chunks)} chunks for {source_name}")
        
        # Save chunks with metadata
        saved_file = save_chunks_as_json(chunks, source_name)
        print(f"Chunks saved to: {saved_file}")
        
        # Show chunk statistics
        chunk_lengths = [len(chunk) for chunk in chunks]
        avg_length = sum(chunk_lengths) / len(chunk_lengths) if chunks else 0
        print(f"Average chunk length: {avg_length:.0f} characters")
        print(f"Min chunk length: {min(chunk_lengths) if chunks else 0}")
        print(f"Max chunk length: {max(chunk_lengths) if chunks else 0}")
        
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found. Skipping...")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

print("\nSemantic chunking complete!")

Initializing SemanticChunker with chunk_size=8192, threshold=0.75
LangChain SemanticChunker is available
LANGCHAIN_SEMANTIC_AVAILABLE: True
Creating SentenceTransformer embeddings...
Initializing SentenceTransformer with model: nomic-ai/nomic-embed-text-v1.5


Encountered exception while importing einops: No module named 'einops'


Failed to load nomic-ai/nomic-embed-text-v1.5, falling back to BAAI/bge-large-en-v1.5: This modeling file requires the following packages that were not found in your environment: einops. Run `pip install einops`


2025-08-22 08:42:31,218 - INFO - LangChain semantic chunker ready with Nomic embeddings (chunk_size=8192, min_chunk_size=2048)


Using min_chunk_size: 2048
Creating LangChain SemanticChunker...
LangChain semantic chunker initialized successfully!
Starting semantic chunking process...

Processing data/langgraph_llms_data.md...
File length: 460281 characters

=== Chunking langgraph ===
Text length: 460281 characters
Semantic chunker available: True
Attempting semantic chunking...
Embedding 1283 documents
