In [1]:
import os
import getpass
from pathlib import Path

# Ask for the OpenAI API key interactively unless the environment already
# provides one, so the secret never has to be written into the notebook.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")


In [2]:
import glob
import json
from datetime import datetime
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document

# Pipeline settings: where the OCR text lives, where the FAISS index and its
# JSON metadata are written, and how the text is chunked and embedded.
CONFIG = dict(
    source_dir="amatol_ocr/amatol_sojourn",
    embeddings_dir="embeddings2",
    faiss_index_dir="embeddings2/faiss_index",
    metadata_file="embeddings2/embedding_metadata.json",
    chunk_size=500,
    chunk_overlap=50,
    embedding_model="text-embedding-3-small",
)

# Make sure the output directory is present before anything is saved into it
os.makedirs(CONFIG["embeddings_dir"], exist_ok=True)

# Echo the active settings so the run is self-describing
print("Configuration loaded:")
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))


Configuration loaded:
  source_dir: amatol_ocr/amatol_sojourn
  embeddings_dir: embeddings2
  faiss_index_dir: embeddings2/faiss_index
  metadata_file: embeddings2/embedding_metadata.json
  chunk_size: 500
  chunk_overlap: 50
  embedding_model: text-embedding-3-small


In [3]:
# File Discovery Functions
def discover_txt_files(source_dir):
    """Return every ``.txt`` path under ``source_dir`` as a sorted list.

    The search is recursive: files directly inside ``source_dir`` and files in
    any nested directory are both matched. A directory with no matches (or one
    that does not exist) yields an empty list.
    """
    recursive_pattern = os.path.join(source_dir, "**", "*.txt")
    matches = list(glob.iglob(recursive_pattern, recursive=True))
    matches.sort()
    return matches

def get_file_stats(file_paths, source_dir=None):
    """Summarise a list of files: count, total size and per-subdirectory tally.

    Args:
        file_paths: Paths of existing files (their sizes are read from disk).
        source_dir: Directory the subdirectory names are computed relative to.
            Defaults to ``CONFIG["source_dir"]`` when omitted.

    Returns:
        dict with the keys
        ``total_files`` (int), ``total_size_bytes`` (int),
        ``subdirectories`` (sorted list of non-root subdirectory names),
        ``file_types`` (mapping of subdirectory name -> file count, where the
        root of ``source_dir`` is stored under the empty string; despite its
        name this is a per-subdirectory count), and
        ``total_size_mb`` (size in MiB rounded to two decimals).

    Raises:
        OSError: if a path in ``file_paths`` cannot be stat'ed.
    """
    if source_dir is None:
        source_dir = CONFIG["source_dir"]

    total_bytes = 0
    files_per_subdir = {}

    for file_path in file_paths:
        total_bytes += os.path.getsize(file_path)

        # Use relpath (as load_documents does) instead of substring replacement:
        # it copes with a trailing separator on source_dir and never strips
        # unrelated occurrences of the directory name from deeper paths.
        parent = os.path.dirname(file_path) or os.curdir
        subdir = os.path.relpath(parent, source_dir)
        if subdir == os.curdir:
            subdir = ""  # files directly in source_dir are keyed by ""

        files_per_subdir[subdir] = files_per_subdir.get(subdir, 0) + 1

    return {
        "total_files": len(file_paths),
        "total_size_bytes": total_bytes,
        "subdirectories": sorted(name for name in files_per_subdir if name),
        "file_types": files_per_subdir,
        "total_size_mb": round(total_bytes / (1024 * 1024), 2),
    }

# Scan the configured source tree and report what was found
print("Discovering .txt files...")
txt_files = discover_txt_files(CONFIG["source_dir"])
file_stats = get_file_stats(txt_files)

summary_lines = [
    f"\nFound {file_stats['total_files']} .txt files",
    f"Total size: {file_stats['total_size_mb']} MB",
    f"Subdirectories: {len(file_stats['subdirectories'])}",
]
print("\n".join(summary_lines))

# Per-subdirectory tally; files directly in the source directory show as "root"
print("\nFiles by subdirectory:")
for folder, n_files in file_stats["file_types"].items():
    print(f"  {folder or 'root'}: {n_files} files")

# Preview at most five paths, then say how many were left out
print("\nFirst 5 files:")
for position, path in enumerate(txt_files[:5], start=1):
    print(f"  {position}. {path}")

remaining = len(txt_files) - 5
if remaining > 0:
    print(f"  ... and {remaining} more")


Discovering .txt files...

Found 1 .txt files
Total size: 0.01 MB
Subdirectories: 0

Files by subdirectory:
  root: 1 files

First 5 files:
  1. amatol_ocr/amatol_sojourn/amatol_sojourn_article.txt


In [4]:
# Document Loading and Processing Functions
def load_documents(file_paths):
    """Read each path with ``TextLoader`` (UTF-8) and tag the documents with file metadata.

    Every loaded document gets these extra metadata keys: ``source_file``
    (basename), ``source_path``, ``relative_path`` and ``subdirectory`` (both
    relative to ``CONFIG["source_dir"]``), ``file_size_bytes`` and
    ``load_timestamp`` (ISO-8601, local time).

    Loading is best-effort: a file that raises is reported, recorded as
    failed, and the loop moves on to the next file.

    Returns:
        A ``(docs, failed_files)`` tuple: the loaded documents and the paths
        that could not be loaded.
    """
    loaded_docs = []
    unreadable = []

    for position, file_path in enumerate(file_paths, start=1):
        try:
            file_docs = TextLoader(file_path, encoding="utf-8").load()

            for doc in file_docs:
                rel_path = os.path.relpath(file_path, CONFIG["source_dir"])
                extra_metadata = {
                    "source_file": os.path.basename(file_path),
                    "source_path": file_path,
                    "relative_path": rel_path,
                    "subdirectory": os.path.dirname(rel_path),
                    "file_size_bytes": os.path.getsize(file_path),
                    "load_timestamp": datetime.now().isoformat(),
                }
                doc.metadata.update(extra_metadata)

            loaded_docs.extend(file_docs)

            # Report progress after every tenth successfully loaded position
            if position % 10 == 0:
                print(f"Loaded {position}/{len(file_paths)} files...")

        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the batch
            print(f"Failed to load {file_path}: {e}")
            unreadable.append(file_path)

    print("\nLoading complete:")
    print(f"  Successfully loaded: {len(loaded_docs)} documents")
    print(f"  Failed to load: {len(unreadable)} files")

    if unreadable:
        print("Failed files:")
        for bad_path in unreadable:
            print(f"  - {bad_path}")

    return loaded_docs, unreadable

def chunk_documents(docs):
    """Split documents into chunks and annotate each chunk with bookkeeping metadata.

    The split uses ``CharacterTextSplitter`` configured from
    ``CONFIG["chunk_size"]`` and ``CONFIG["chunk_overlap"]``. Each chunk keeps
    the metadata of its source document and additionally receives
    ``chunk_id`` (position in the returned list), ``chunk_size`` (character
    count) and ``chunk_timestamp`` (ISO-8601, local time).

    Args:
        docs: Documents to split (may be empty).

    Returns:
        The list of chunks. When no chunks are produced the list is empty and
        the size statistics are skipped instead of raising.
    """
    print(f"Chunking {len(docs)} documents...")

    # NOTE(review): chunk_size is not a hard cap with this splitter. The
    # recorded run of this notebook logged chunks up to 1256 characters for a
    # configured size of 500.
    splitter = CharacterTextSplitter(
        chunk_size=CONFIG["chunk_size"],
        chunk_overlap=CONFIG["chunk_overlap"]
    )

    chunks = splitter.split_documents(docs)

    # Add chunk-specific metadata and collect the sizes in the same pass
    chunk_sizes = []
    for i, chunk in enumerate(chunks):
        size = len(chunk.page_content)
        chunk.metadata["chunk_id"] = i
        chunk.metadata["chunk_size"] = size
        chunk.metadata["chunk_timestamp"] = datetime.now().isoformat()
        chunk_sizes.append(size)

    print(f"Created {len(chunks)} chunks")

    # With no chunks the average would divide by zero and min()/max() would
    # raise on an empty sequence, so report the situation and stop here.
    if not chunk_sizes:
        print("No chunks were produced; skipping chunk statistics.")
        return chunks

    avg_chunk_size = sum(chunk_sizes) / len(chunk_sizes)

    print(f"Average chunk size: {avg_chunk_size:.0f} characters")
    print(f"Min chunk size: {min(chunk_sizes)} characters")
    print(f"Max chunk size: {max(chunk_sizes)} characters")

    return chunks


In [5]:
# Embedding and Persistence Functions
def check_existing_embeddings():
    """Report whether a saved FAISS index and its metadata file are both on disk.

    Returns:
        ``(True, metadata)`` when both ``CONFIG["faiss_index_dir"]`` and
        ``CONFIG["metadata_file"]`` exist and the metadata could be read and
        summarised; ``(False, None)`` when either is missing or when reading
        the metadata fails.
    """
    index_present = os.path.exists(CONFIG["faiss_index_dir"])
    metadata_present = os.path.exists(CONFIG["metadata_file"])

    if not (index_present and metadata_present):
        print("No existing embeddings found.")
        return False, None

    try:
        with open(CONFIG["metadata_file"], 'r') as handle:
            metadata = json.load(handle)

        print("Found existing embeddings:")
        # Missing keys are shown as 'Unknown' rather than treated as an error
        for label, key in (
            ("Created", "created_at"),
            ("Documents", "num_documents"),
            ("Chunks", "num_chunks"),
            ("Model", "embedding_model"),
        ):
            print(f"  {label}: {metadata.get(key, 'Unknown')}")
        return True, metadata
    except Exception as e:
        # Unreadable or malformed metadata is treated as "no usable embeddings"
        print(f"Error reading metadata: {e}")
        return False, None

def create_embeddings(chunks):
    """Embed the chunks, persist the FAISS index, and write a JSON metadata summary.

    The index is saved to ``CONFIG["faiss_index_dir"]`` and the summary to
    ``CONFIG["metadata_file"]``. Embeddings are produced with
    ``OpenAIEmbeddings`` using ``CONFIG["embedding_model"]``.

    Args:
        chunks: Chunk documents whose metadata carries ``source_file`` and
            ``subdirectory`` (and normally ``source_path``), as produced by
            ``load_documents`` followed by ``chunk_documents``.

    Returns:
        A ``(vectorstore, metadata)`` tuple: the in-memory FAISS store and the
        metadata dict that was written to disk.

    Raises:
        ValueError: if ``chunks`` is empty, since there is nothing to embed.
        KeyError: if a chunk lacks the expected metadata keys.
    """
    # Fail early with a clear message instead of an obscure error from inside
    # the vector store when there is nothing to index.
    if not chunks:
        raise ValueError("Cannot create embeddings: no chunks were provided.")

    print(f"Creating embeddings for {len(chunks)} chunks...")
    print(f"Using model: {CONFIG['embedding_model']}")

    # Initialize embeddings
    embeddings = OpenAIEmbeddings(model=CONFIG["embedding_model"])

    # Create vector store
    print("Building FAISS vector store...")
    vectorstore = FAISS.from_documents(chunks, embeddings)

    # Save vector store
    print(f"Saving embeddings to {CONFIG['faiss_index_dir']}...")
    vectorstore.save_local(CONFIG["faiss_index_dir"])

    # Count documents by full path when available: two files with the same
    # basename in different subdirectories are distinct documents.
    document_keys = {
        chunk.metadata.get("source_path") or chunk.metadata["source_file"]
        for chunk in chunks
    }

    # Create and save metadata
    metadata = {
        "created_at": datetime.now().isoformat(),
        "embedding_model": CONFIG["embedding_model"],
        "chunk_size": CONFIG["chunk_size"],
        "chunk_overlap": CONFIG["chunk_overlap"],
        "num_documents": len(document_keys),
        "num_chunks": len(chunks),
        "source_directory": CONFIG["source_dir"],
        # Sorted so the metadata file is stable from one run to the next
        "subdirectories": sorted({chunk.metadata["subdirectory"] for chunk in chunks}),
        "total_characters": sum(len(chunk.page_content) for chunk in chunks)
    }

    with open(CONFIG["metadata_file"], 'w') as f:
        json.dump(metadata, f, indent=2)

    print("Embeddings created and saved successfully!")
    return vectorstore, metadata

def load_existing_embeddings():
    """Load the saved FAISS index and its JSON metadata from disk.

    The index is read from ``CONFIG["faiss_index_dir"]`` using
    ``OpenAIEmbeddings`` with ``CONFIG["embedding_model"]``; the metadata is
    read from ``CONFIG["metadata_file"]``.

    Returns:
        A ``(vectorstore, metadata)`` tuple.
    """
    import inspect

    print("Loading existing embeddings...")
    embeddings = OpenAIEmbeddings(model=CONFIG["embedding_model"])

    # Newer LangChain releases refuse to load a saved FAISS store unless the
    # caller opts in with allow_dangerous_deserialization=True, because the
    # stored docstore is unpickled. Older releases do not know the argument,
    # so only pass it when the installed signature declares it.
    # SECURITY: opting in is only appropriate for an index this notebook wrote
    # itself; never point faiss_index_dir at files from an untrusted source.
    load_kwargs = {}
    if "allow_dangerous_deserialization" in inspect.signature(FAISS.load_local).parameters:
        load_kwargs["allow_dangerous_deserialization"] = True

    vectorstore = FAISS.load_local(CONFIG["faiss_index_dir"], embeddings, **load_kwargs)

    with open(CONFIG["metadata_file"], 'r') as f:
        metadata = json.load(f)

    print("Existing embeddings loaded successfully!")
    return vectorstore, metadata


In [6]:
# Main Execution Pipeline
# Either reuse the saved index or (re)build it from the discovered text files,
# then print a summary of whatever ended up in `vectorstore` / `metadata`.
print("=" * 60)
print("AMATOL OCR EMBEDDING PIPELINE")
print("=" * 60)

# Check for existing embeddings
has_existing, existing_metadata = check_existing_embeddings()

# Build by default; only an explicit "1" at the prompt switches to loading.
rebuild = True
if has_existing:
    choice = input("\nExisting embeddings found. Do you want to:\n1. Load existing embeddings\n2. Recreate embeddings (will overwrite)\nEnter choice (1 or 2): ").strip()

    # Recreating overwrites the saved index, so a typo must not trigger it:
    # keep asking until the answer is one of the two offered options.
    while choice not in ("1", "2"):
        choice = input("Please enter 1 or 2: ").strip()

    rebuild = choice == "2"

if rebuild:
    if has_existing:
        print("\nProceeding with recreation...")
    else:
        print("\nNo existing embeddings found. Creating new embeddings...")

    # Load and process documents (single code path for both "new" and "recreate")
    docs, failed_files = load_documents(txt_files)
    chunks = chunk_documents(docs)
    vectorstore, metadata = create_embeddings(chunks)

    print("\n" + "=" * 60)
    print("EMBEDDINGS CREATED SUCCESSFULLY")
    print("=" * 60)
else:
    vectorstore, metadata = load_existing_embeddings()

    print("\n" + "=" * 60)
    print("EMBEDDINGS LOADED SUCCESSFULLY")
    print("=" * 60)
    print(f"Total chunks available: {metadata['num_chunks']}")
    print(f"Total documents: {metadata['num_documents']}")

# Display final statistics
# Guard the average so metadata reporting zero chunks cannot divide by zero
num_chunks = metadata['num_chunks']
avg_chars_per_chunk = metadata['total_characters'] // num_chunks if num_chunks else 0

print("\nFinal Statistics:")
print(f"  Documents processed: {metadata['num_documents']}")
print(f"  Total chunks: {num_chunks}")
print(f"  Total characters: {metadata['total_characters']:,}")
print(f"  Average characters per chunk: {avg_chars_per_chunk:,}")
print(f"  Subdirectories: {len(metadata['subdirectories'])}")
print(f"  Embedding model: {metadata['embedding_model']}")
print(f"  Storage location: {CONFIG['faiss_index_dir']}")

print("\n✅ Embeddings are ready for use!")


Created a chunk of size 614, which is longer than the specified 500
Created a chunk of size 773, which is longer than the specified 500
Created a chunk of size 920, which is longer than the specified 500
Created a chunk of size 559, which is longer than the specified 500
Created a chunk of size 590, which is longer than the specified 500
Created a chunk of size 544, which is longer than the specified 500
Created a chunk of size 896, which is longer than the specified 500
Created a chunk of size 1256, which is longer than the specified 500
Created a chunk of size 510, which is longer than the specified 500
Created a chunk of size 642, which is longer than the specified 500


AMATOL OCR EMBEDDING PIPELINE
No existing embeddings found.

No existing embeddings found. Creating new embeddings...

Loading complete:
  Successfully loaded: 1 documents
  Failed to load: 0 files
Chunking 1 documents...
Created 19 chunks
Average chunk size: 492 characters
Min chunk size: 7 characters
Max chunk size: 1256 characters
Creating embeddings for 19 chunks...
Using model: text-embedding-3-small


  embeddings = OpenAIEmbeddings(model=CONFIG["embedding_model"])


Building FAISS vector store...
Saving embeddings to embeddings2/faiss_index...
Embeddings created and saved successfully!

EMBEDDINGS CREATED SUCCESSFULLY

Final Statistics:
  Documents processed: 1
  Total chunks: 19
  Total characters: 9,346
  Average characters per chunk: 491
  Subdirectories: 1
  Embedding model: text-embedding-3-small
  Storage location: embeddings2/faiss_index

✅ Embeddings are ready for use!


In [7]:
# Test the Embeddings with Similarity Search
def test_similarity_search(vectorstore, query, k=5):
    """Test the vector store with a sample query."""
    print(f"\n🔍 Testing similarity search with query: '{query}'")
    print("-" * 50)
    
    # Perform similarity search
    docs = vectorstore.similarity_search(query, k=k)
    
    print(f"Found {len(docs)} relevant chunks:")
    print()
    
    for i, doc in enumerate(docs, 1):
        print(f"Result {i}:")
        print(f"  Source: {doc.metadata.get('source_file', 'Unknown')}")
        print(f"  Subdirectory: {doc.metadata.get('subdirectory', 'Unknown')}")
        print(f"  Chunk ID: {doc.metadata.get('chunk_id', 'Unknown')}")
        print(f"  Content preview: {doc.page_content[:200]}...")
        print()
    
    return docs

# Smoke-test the vector store with a few representative queries.
# These searches run as part of this cell (each one embeds its query text).
for sample_query in ("Amatol plant production", "explosion accident", "newspaper article"):
    test_similarity_search(vectorstore, sample_query, k=3)

# The previous message told the reader to "uncomment" queries that were in
# fact already executing; say what actually happened instead.
print("🧪 Similarity search smoke tests complete.")



🔍 Testing similarity search with query: 'Amatol plant production'
--------------------------------------------------
Found 3 relevant chunks:

Result 1:
  Source: amatol_sojourn_article.txt
  Subdirectory: 
  Chunk ID: 10
  Content preview: The town reached a peak population of 7,000, had the capacity to house over 10,000, and was planned to accommodate a possible population of 25,000. 

Amatol Plant...

Result 2:
  Source: amatol_sojourn_article.txt
  Subdirectory: 
  Chunk ID: 11
  Content preview: Loading operations at the Amatol plant began on July 31, 1918, and on August 3, the first shell was loaded. The plant was capable of loading “60,000 shells of all sizes, 50,000 boosters, 50,000 hand g...

Result 3:
  Source: amatol_sojourn_article.txt
  Subdirectory: 
  Chunk ID: 8
  Content preview: Development was rapid. By June, the population was nearly 2,000, and train service was added. By August, advertisements for Amatol described it as “a city with all modern improvements with el

In [8]:
# Utility Functions for Working with Embeddings

def get_embedding_info():
    """Print the saved embedding metadata and return it.

    Reads ``CONFIG["metadata_file"]``. Returns the parsed metadata dict, or
    ``None`` (after printing a notice) when the file does not exist.
    """
    if not os.path.exists(CONFIG["metadata_file"]):
        print("❌ No embedding metadata found.")
        return None

    with open(CONFIG["metadata_file"], 'r') as handle:
        metadata = json.load(handle)

    print("📊 EMBEDDING INFORMATION")
    print("=" * 50)

    # Plain "label: value" rows that precede the formatted character count
    for label, key in (
        ("Created", "created_at"),
        ("Model", "embedding_model"),
        ("Source directory", "source_directory"),
        ("Documents", "num_documents"),
        ("Chunks", "num_chunks"),
    ):
        print(f"{label}: {metadata[key]}")

    print(f"Total characters: {metadata['total_characters']:,}")

    for label, key in (("Chunk size", "chunk_size"), ("Chunk overlap", "chunk_overlap")):
        print(f"{label}: {metadata[key]}")

    subdirectory_list = ', '.join(metadata['subdirectories'])
    print(f"Subdirectories: {subdirectory_list}")

    return metadata

def delete_embeddings():
    """Remove the saved FAISS index directory and metadata file after confirmation.

    The user must type ``yes`` (case-insensitive, surrounding whitespace
    ignored); any other answer cancels. Paths come from
    ``CONFIG["faiss_index_dir"]`` and ``CONFIG["metadata_file"]``; a path that
    does not exist is simply skipped.
    """
    import shutil

    answer = input("⚠️  Are you sure you want to delete all embeddings? (yes/no): ").strip().lower()

    if answer != "yes":
        print("❌ Deletion cancelled.")
        return

    index_dir = CONFIG["faiss_index_dir"]
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
        print(f"🗑️  Deleted {index_dir}")

    metadata_path = CONFIG["metadata_file"]
    if os.path.exists(metadata_path):
        os.remove(metadata_path)
        print(f"🗑️  Deleted {metadata_path}")

    print("✅ All embeddings deleted.")

def search_by_source(vectorstore, source_filename, k=10):
    """Find the chunks that came from one source file and print a preview of them.

    Args:
        vectorstore: The vector store to inspect. When it exposes
            ``index_to_docstore_id`` and ``docstore`` (as the LangChain FAISS
            store does) its stored documents are enumerated directly.
        source_filename: Value to match against each chunk's ``source_file``
            metadata entry.
        k: Maximum number of matching chunks to preview and return.

    Returns:
        At most ``k`` matching documents. With a FAISS-style store they are in
        index order; with the fallback path they are in the order returned by
        the similarity search.
    """
    print(f"🔍 Searching for chunks from: {source_filename}")

    index_map = getattr(vectorstore, "index_to_docstore_id", None)
    docstore = getattr(vectorstore, "docstore", None)

    if index_map is not None and docstore is not None:
        # Walk the stored documents instead of running a similarity search on
        # an empty query: no embedding request is made, nothing is capped at
        # an arbitrary neighbour count, and the order is deterministic.
        looked_up = (docstore.search(index_map[position]) for position in sorted(index_map))
        # A lookup miss yields a message string rather than a document,
        # so keep only objects that actually carry metadata.
        all_docs = [doc for doc in looked_up if hasattr(doc, "metadata")]
    else:
        # Fallback for stores without that layout: original best-effort
        # approach, limited to the 1000 nearest neighbours of an empty query.
        all_docs = vectorstore.similarity_search("", k=1000)

    matching_docs = [doc for doc in all_docs if doc.metadata.get('source_file') == source_filename]

    print(f"Found {len(matching_docs)} chunks from {source_filename}")

    for i, doc in enumerate(matching_docs[:k], 1):
        print(f"\nChunk {i}:")
        print(f"  Chunk ID: {doc.metadata.get('chunk_id', 'Unknown')}")
        print(f"  Size: {doc.metadata.get('chunk_size', 'Unknown')} chars")
        print(f"  Preview: {doc.page_content[:150]}...")

    return matching_docs[:k]

# Announce the helper functions defined above and how to call them
print("🛠️  UTILITY FUNCTIONS LOADED")
print("Available functions:")
for usage_line in (
    "  - get_embedding_info(): Display embedding statistics",
    "  - delete_embeddings(): Delete all embeddings (with confirmation)",
    "  - search_by_source(vectorstore, 'filename.txt'): Find chunks from specific file",
    "  - test_similarity_search(vectorstore, 'query'): Test semantic search",
):
    print(usage_line)


🛠️  UTILITY FUNCTIONS LOADED
Available functions:
  - get_embedding_info(): Display embedding statistics
  - delete_embeddings(): Delete all embeddings (with confirmation)
  - search_by_source(vectorstore, 'filename.txt'): Find chunks from specific file
  - test_similarity_search(vectorstore, 'query'): Test semantic search
