# Document Processing with LangChain

This notebook defines and uses a class to process files, chunk them, generate embeddings, and store them in a structured format.

## Define the Document Processor Class

We'll create a class that handles:
1. Loading documents from file paths
2. Chunking documents into smaller pieces
3. Creating embeddings for chunks
4. Storing processed chunks and their embeddings

In [None]:
import os
from typing import List, Dict, Any, Optional, Union
import pandas as pd

from langchain_community.document_loaders import (
    TextLoader, 
    PyPDFLoader, 
    CSVLoader,
    UnstructuredMarkdownLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

class DocumentProcessor:
    """
    A class to process documents by loading, chunking, embedding, and storing them.
    """
    
    def __init__(self, file_paths: List[str] = None, embedding_model: str = "openai"):
        """
        Initialize the DocumentProcessor with file paths and embedding model choice.
        
        Args:
            file_paths: List of paths to documents that should be processed
            embedding_model: Type of embedding model to use ('openai' or 'huggingface')
        """
        self.file_paths = file_paths or []
        self.documents = []
        self.chunks = []
        self.embedding_model_name = embedding_model
        self.embeddings = None
        self._setup_embeddings()
        
    def _setup_embeddings(self):
        """Set up the embedding model based on the configuration."""
        if self.embedding_model_name == "openai":
            self.embeddings = OpenAIEmbeddings()
        elif self.embedding_model_name == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        else:
            raise ValueError(f"Unsupported embedding model: {self.embedding_model_name}")
    
    def add_file_path(self, file_path: str):
        """Add a file path to the processor."""
        if os.path.exists(file_path):
            self.file_paths.append(file_path)
        else:
            raise FileNotFoundError(f"File not found: {file_path}")
            
    def load_documents(self):
        """Load documents from file paths."""
        documents = []
        
        for file_path in self.file_paths:
            try:
                if file_path.endswith('.txt'):
                    loader = TextLoader(file_path)
                elif file_path.endswith('.pdf'):
                    loader = PyPDFLoader(file_path)
                elif file_path.endswith('.csv'):
                    loader = CSVLoader(file_path)
                elif file_path.endswith('.md'):
                    loader = UnstructuredMarkdownLoader(file_path)
                else:
                    print(f"Unsupported file type: {file_path}")
                    continue
                    
                file_documents = loader.load()
                for doc in file_documents:
                    doc.metadata['source_file'] = os.path.basename(file_path)
                documents.extend(file_documents)
                print(f"Loaded {len(file_documents)} documents from {file_path}")
                
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
                
        self.documents = documents
        return self.documents

## Chunk Files

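Next, we'll implement the method that divides documents into smaller chunks for better processing. Note that `chunk_documents` is a method of `DocumentProcessor`: the indented code below continues the class body, so to run it, keep it in the same cell as the class definition above.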

In [None]:
    def chunk_documents(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Split documents into smaller chunks.
        
        Args:
            chunk_size: The target size of each chunk
            chunk_overlap: The overlap between chunks
        
        Returns:
            List of document chunks
        """
        if not self.documents:
            self.load_documents()
            
        if not self.documents:
            print("No documents to chunk!")
            return []
            
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", " ", ""]
        )
        
        self.chunks = text_splitter.split_documents(self.documents)
        print(f"Created {len(self.chunks)} chunks from {len(self.documents)} documents")
        
        return self.chunks

## Embed Files

Now we'll create a method to generate embeddings for the file chunks using the configured embedding model.

In [None]:
    def embed_chunks(self):
        """
        Generate embeddings for each chunk and store them with the chunk.
        
        Returns:
            List of dictionaries containing chunks and their embeddings
        """
        if not self.chunks:
            self.chunk_documents()
            
        if not self.chunks:
            print("No chunks to embed!")
            return []
            
        embedded_documents = []
        
        # Process in batches of 100
        batch_size = 100
        for i in range(0, len(self.chunks), batch_size):
            batch = self.chunks[i:i + batch_size]
            texts = [doc.page_content for doc in batch]
            
            try:
                # Generate embeddings
                embeddings = self.embeddings.embed_documents(texts)
                
                # Store chunks with their embeddings
                for j, doc in enumerate(batch):
                    embedded_doc = {
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                        'embedding': embeddings[j]
                    }
                    embedded_documents.append(embedded_doc)
                    
                print(f"Embedded chunks {i} to {i + len(batch) - 1}")
                
            except Exception as e:
                print(f"Error generating embeddings for batch {i}-{i + len(batch) - 1}: {e}")
                
        self.documents = embedded_documents
        return self.documents

## Process and Store Documents

We'll add methods to process and store the embedded chunks in a format that can be easily queried or exported.

In [None]:
    def process_files(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Process all files: load, chunk, and embed them.
        
        Args:
            chunk_size: The target size of each chunk
            chunk_overlap: The overlap between chunks
            
        Returns:
            Processed documents with embeddings
        """
        self.load_documents()
        self.chunk_documents(chunk_size, chunk_overlap)
        return self.embed_chunks()
    
    def get_documents_df(self):
        """
        Convert documents to a pandas DataFrame for easy analysis.
        
        Returns:
            DataFrame with document content, metadata, and embedding information
        """
        if not self.documents:
            print("No processed documents available!")
            return pd.DataFrame()
            
        # Create a DataFrame without the embeddings (they can make display messy)
        docs_for_df = [
            {
                'content': doc['content'],
                'source_file': doc['metadata'].get('source_file', ''),
                'page': doc['metadata'].get('page', 0),
                'embedding_length': len(doc['embedding'])
            }
            for doc in self.documents
        ]
        
        return pd.DataFrame(docs_for_df)
    
    def save_to_json(self, output_path: str):
        """
        Save the processed documents to a JSON file.
        
        Args:
            output_path: Path to save the JSON file
            
        Returns:
            Path to the saved file
        """
        if not self.documents:
            print("No documents to save!")
            return None
            
        # Convert embeddings to lists for JSON serialization
        json_docs = []
        for doc in self.documents:
            doc_copy = doc.copy()
            doc_copy['embedding'] = doc_copy['embedding'].tolist() if hasattr(doc_copy['embedding'], 'tolist') else doc_copy['embedding']
            json_docs.append(doc_copy)
            
        import json
        with open(output_path, 'w') as f:
            json.dump(json_docs, f)
            
        print(f"Saved {len(json_docs)} documents to {output_path}")
        return output_path

## Use the Document Processor Class

Let's test our class with some sample files and see the results.

In [None]:
# Build a small fixture: a scratch directory holding one text file and one
# Markdown file for the processor to work on.
import tempfile

# Body of the plain-text sample.
sample_text_body = """# Sample Document
    
This is a sample document that we'll use to test our DocumentProcessor class.

It contains multiple paragraphs with different content to demonstrate chunking.

## Section 1
This section talks about machine learning and its applications.
Machine learning is a subset of artificial intelligence that provides systems 
the ability to automatically learn and improve from experience without being 
explicitly programmed.

## Section 2
This section covers natural language processing (NLP).
NLP is a field of AI that gives computers the ability to understand text and 
spoken words in much the same way human beings can.

## Section 3
This section discusses embeddings in machine learning.
Embeddings are a type of representation that captures the meaning of objects 
such as words, sentences, or documents in a numerical form that computers can process.
"""

# Body of the Markdown sample.
sample_notes_body = """# Research Notes

## Important Concepts

- Vector databases store embeddings efficiently
- Chunking strategies affect retrieval quality
- Embedding models have different strengths

## Implementation Ideas

1. Use recursive chunking for nested documents
2. Compare different embedding models
3. Implement similarity search for retrieval
4. Add metadata filtering capabilities

## Future Work

Consider integrating with other systems and adding visualization tools.
"""

# Scratch directory; it is removed again by the cleanup cell.
temp_dir = tempfile.mkdtemp()
print(f"Created temporary directory: {temp_dir}")

text_path = os.path.join(temp_dir, "sample.txt")
md_path = os.path.join(temp_dir, "notes.md")

# Write both samples with the same open/write sequence.
for target_path, body in ((text_path, sample_text_body), (md_path, sample_notes_body)):
    with open(target_path, "w") as handle:
        handle.write(body)

print(f"Created sample files at:\n- {text_path}\n- {md_path}")

In [None]:
# Build the processor for the demo.
# OpenAI embeddings would need an API key in the environment, so the
# HuggingFace backend (which doesn't require API keys) is selected here.
processor = DocumentProcessor(embedding_model="huggingface")

# Register both sample files; add_file_path() raises if a path is missing.
for sample_path in (text_path, md_path):
    processor.add_file_path(sample_path)

# Load, chunk and embed with small chunks so the short samples split up.
processor.process_files(chunk_size=200, chunk_overlap=50)

# Tabulate the result; the trailing expression is what the cell displays.
df = processor.get_documents_df()
print(f"Processed {len(df)} document chunks")
df.head()

In [None]:
# Let's examine the length of our chunks
# Adds a 'content_length' column holding len() of each chunk's content.
df['content_length'] = df['content'].apply(len)
# As the last expression of the cell, the summary below is what gets displayed.
# NOTE(review): 'source_file' is a text column, so describe() with default
# arguments is expected to summarize only 'content_length' — confirm this is
# the intended view.
df[['content_length', 'source_file']].describe()

In [None]:
# Print the first three chunks to inspect where the splitter cut the text.
for idx, chunk_row in df.head(3).iterrows():
    header = f"--- Chunk {idx + 1} from {chunk_row['source_file']} ---"
    print(header)
    print(chunk_row['content'])
    # Blank spacer between chunks.
    print("\n")

In [None]:
# Save our processed documents for later use
# Note: the file is written inside temp_dir, so it is deleted together with
# that directory when the cleanup cell calls shutil.rmtree(temp_dir).
output_path = os.path.join(temp_dir, "processed_documents.json")
# save_to_json() returns the path it wrote to (or None when there is nothing
# to save); as the cell's last expression, that value is displayed.
processor.save_to_json(output_path)

## Semantic Search with Embeddings

Now let's implement a simple semantic search function to demonstrate the power of embeddings.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(query, processor, top_k=3):
    """
    Search for documents similar to the query using cosine similarity.

    All similarities are computed in a single NumPy matrix operation rather
    than with one library call per document. A document (or query) whose
    embedding has zero length gets a similarity of 0.0.

    Args:
        query: The search query
        processor: DocumentProcessor instance with embedded documents; its
            `documents` must be dictionaries with 'content', 'metadata' and
            'embedding' keys, and its `embeddings` must offer embed_query()
        top_k: Number of results to return; values <= 0 yield no results

    Returns:
        List of at most top_k dictionaries with the keys 'content',
        'metadata' and 'similarity', ordered from most to least similar.
        Documents with equal similarity keep their original relative order.
    """
    if not processor.documents:
        print("No documents to search!")
        return []

    # A negative top_k would otherwise slice from the wrong end of the list.
    if top_k <= 0:
        return []

    # Get query embedding
    query_vector = np.asarray(processor.embeddings.embed_query(query), dtype=float)

    # One row per document embedding.
    doc_matrix = np.asarray(
        [doc['embedding'] for doc in processor.documents], dtype=float
    )

    # cosine = (d . q) / (|d| * |q|), evaluated for every row at once.
    dot_products = doc_matrix @ query_vector
    denominators = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vector)
    # A zero norm implies a zero dot product, so dividing by 1 yields 0.0.
    safe_denominators = np.where(denominators == 0, 1.0, denominators)
    scores = dot_products / safe_denominators

    similarities = [
        {
            'content': doc['content'],
            'metadata': doc['metadata'],
            'similarity': score
        }
        for doc, score in zip(processor.documents, scores)
    ]

    # Sort by similarity (highest first); list.sort() is stable.
    similarities.sort(key=lambda item: item['similarity'], reverse=True)

    # Return top_k results
    return similarities[:top_k]

In [None]:
# Run a handful of example queries against the embedded chunks.
search_queries = [
    "What is machine learning?",
    "How do embeddings work?",
    "Tell me about chunking documents",
]

for query in search_queries:
    print(f"\n--- Search results for: '{query}' ---")
    results = semantic_search(query, processor)

    # Ranks are shown 1-based.
    for rank, hit in enumerate(results, start=1):
        print(f"\nResult {rank} (similarity: {hit['similarity']:.4f}):")
        origin = hit['metadata'].get('source_file', 'unknown')
        print(f"Source: {origin}")
        # Only the first 150 characters of the chunk are shown.
        snippet = hit['content'][:150]
        print(f"Content: {snippet}...")

## Cleanup and Conclusion

Our DocumentProcessor class successfully:
1. Loads documents from different file formats
2. Chunks them into manageable pieces
3. Creates embeddings for semantic understanding
4. Stores the processed documents with their metadata
5. Provides utilities for analysis and retrieval

This foundation can be extended for various applications like question answering systems, document search engines, or knowledge bases.

In [None]:
# Remove the scratch directory and everything written into it.
import shutil

shutil.rmtree(temp_dir)

# Both messages in one call; sep="\n" reproduces the separate-line output.
print(
    f"Removed temporary directory: {temp_dir}",
    "\nDocumentProcessor demonstration complete!",
    sep="\n",
)