# Test Local Embedding

Notebook that tests the local LMStudio embedding model accessibility and functionality.

Evaluates the API performance for different parameter and batching configurations.

In [None]:
import requests

# LM Studio server endpoint
HOST = "192.168.X.Y"  # Replace with your server's IP address
LM_STUDIO_URL = f"http://{HOST}:1234"

# Ask the server which models it exposes. A timeout is set so that a wrong or
# unreachable HOST fails quickly instead of blocking the cell indefinitely
# (requests waits forever when no timeout is given).
response = requests.get(f"{LM_STUDIO_URL}/v1/models", timeout=10)
response.raise_for_status()  # surface HTTP errors instead of parsing an error body

models = response.json()

# Print the "id" of every entry under "data"; a missing "data" key prints nothing.
print("Available models:")
for model in models.get("data", []):
    print(f"  - {model['id']}")

In [None]:
import time
import concurrent.futures
from typing import List

# Embedding model to test; used as the default "model" value of the request
# helpers below (note: corrected typo from "nomix" to "nomic")
EMBEDDING_MODEL = "text-embedding-nomic-embed-text-v1.5@f16"

def get_embeddings(
    texts: List[str],
    model: str = EMBEDDING_MODEL,
    timeout: float = 60,
) -> dict:
    """
    Get embeddings for a list of texts using the batch API.

    Args:
        texts: Texts to embed; sent together as the "input" of one request
        model: Embedding model identifier
        timeout: Seconds to wait for the server before giving up

    Returns:
        The decoded JSON response body

    Raises:
        requests.HTTPError: If the server answers with an error status
        requests.Timeout: If the server does not answer within ``timeout``
    """
    response = requests.post(
        f"{LM_STUDIO_URL}/v1/embeddings",
        json={
            "model": model,
            "input": texts,  # LM Studio supports batched input
        },
        # Without a timeout requests waits forever on a stalled server.
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def get_embedding_single(
    text: str,
    model: str = EMBEDDING_MODEL,
    timeout: float = 60,
) -> dict:
    """
    Get embedding for a single text.

    Args:
        text: Text to embed; sent as a plain string "input"
        model: Embedding model identifier
        timeout: Seconds to wait for the server before giving up

    Returns:
        The decoded JSON response body

    Raises:
        requests.HTTPError: If the server answers with an error status
        requests.Timeout: If the server does not answer within ``timeout``
    """
    response = requests.post(
        f"{LM_STUDIO_URL}/v1/embeddings",
        json={
            "model": model,
            "input": text,
        },
        # Without a timeout requests waits forever on a stalled server.
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

# Sample texts for testing
test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is a subset of artificial intelligence.",
    "Python is a popular programming language for data science.",
    "LM Studio allows running local language models.",
    "Embeddings convert text into numerical vectors.",
    "Vector databases store and search embeddings efficiently.",
    "Neural networks learn patterns from training data.",
    "Natural language processing enables computers to understand text.",
    "Transformers revolutionized the field of NLP.",
    "Local AI provides privacy and control over your data.",
]

print(f"Testing embedding model: {EMBEDDING_MODEL}")
print(f"Number of test texts: {len(test_texts)}")
print("-" * 50)

# Test 1: one HTTP request per text, issued back to back (the baseline).
print("\n1. Sequential single requests:")
start_time = time.perf_counter()
for text in test_texts:
    get_embedding_single(text)  # response is discarded; only the latency matters
sequential_time = time.perf_counter() - start_time
print(f"   Total time: {sequential_time:.3f}s")
print(f"   Avg per text: {sequential_time/len(test_texts)*1000:.1f}ms")

# Test 2: Batched request (all texts at once)
print("\n2. Batched request (all texts at once):")
start_time = time.perf_counter()
result = get_embeddings(test_texts)
batch_time = time.perf_counter() - start_time
print(f"   Total time: {batch_time:.3f}s")
print(f"   Avg per text: {batch_time/len(test_texts)*1000:.1f}ms")
print(f"   Embedding dimension: {len(result['data'][0]['embedding'])}")

# Test 3: Parallel requests using ThreadPoolExecutor
print("\n3. Parallel single requests (4 workers):")
start_time = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(get_embedding_single, text) for text in test_texts]
    # .result() re-raises any exception from a worker, so failures are not hidden.
    # Results arrive in completion order, not input order.
    results = [f.result() for f in concurrent.futures.as_completed(futures)]
parallel_time = time.perf_counter() - start_time
print(f"   Total time: {parallel_time:.3f}s")
print(f"   Avg per text: {parallel_time/len(test_texts)*1000:.1f}ms")

# Test 4: Larger batch performance test
print("\n4. Throughput test (100 texts in batches of 10):")
large_test = test_texts * 10  # 100 texts
batch_size = 10
start_time = time.perf_counter()
for i in range(0, len(large_test), batch_size):
    batch = large_test[i:i+batch_size]
    get_embeddings(batch)
large_batch_time = time.perf_counter() - start_time
print(f"   Total time: {large_batch_time:.3f}s")
print(f"   Throughput: {len(large_test)/large_batch_time:.1f} texts/second")

# Summary
print("\n" + "=" * 50)
print("PERFORMANCE SUMMARY")
print("=" * 50)
print(f"Sequential:  {sequential_time:.3f}s (baseline)")
print(f"Batched:     {batch_time:.3f}s ({sequential_time/batch_time:.1f}x faster)")
print(f"Parallel:    {parallel_time:.3f}s ({sequential_time/parallel_time:.1f}x faster)")

# Recommend whichever strategy was actually fastest in this run rather than
# always claiming that batching won.
timings = {
    "sequential": sequential_time,
    "batched": batch_time,
    "parallel": parallel_time,
}
fastest = min(timings, key=timings.get)
print(f"\nRecommendation: Use {fastest} requests for best performance!")

In [None]:
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Iterator, Tuple

@dataclass
class EmbeddingResult:
    """A single embedded text: the source string, its vector and its position."""
    # The text that was embedded.
    text: str
    # The embedding exactly as stored by the caller (a list of floats).
    embedding: List[float]
    # Position assigned to this text by whoever created the result.
    index: int

    @property
    def vector(self) -> np.ndarray:
        """Build a fresh float32 numpy array from ``embedding`` on every access."""
        as_float32 = np.array(self.embedding, dtype=np.float32)
        return as_float32

class EmbeddingClient:
    """
    Client for generating text embeddings using LM Studio's API.
    Optimized for batch processing to populate vector databases.

    Texts are posted to ``{base_url}/v1/embeddings`` in groups of at most
    ``batch_size``. ``context_length`` and ``model_batch_size`` are not sent to
    the server; they are only used by ``max_safe_text_length`` to size inputs.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:1234",
        model: str = "text-embedding-nomic-embed-text-v1.5@f16",
        batch_size: int = 32,
        timeout: int = 60,
        # Model configuration limits
        context_length: int = 2048,
        model_batch_size: int = 1024
    ):
        """
        Initialize the embedding client.

        Args:
            base_url: LM Studio server URL (trailing slashes are stripped)
            model: Embedding model identifier
            batch_size: Number of texts to process per API call (must be >= 1)
            timeout: Request timeout in seconds
            context_length: Maximum context length in tokens (model config)
            model_batch_size: Maximum batch size in tokens (model config)

        Raises:
            ValueError: If batch_size is smaller than 1
        """
        # Fail here rather than later: a batch size of 0 would make range()
        # raise inside embed_many, and a negative one would silently embed nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.batch_size = batch_size
        self.timeout = timeout
        self.context_length = context_length
        self.model_batch_size = model_batch_size
        # Filled in on first access of ``dimension``.
        self._dimension: Optional[int] = None

    @property
    def dimension(self) -> int:
        """Get the embedding dimension (lazy-loaded with one request, then cached)."""
        if self._dimension is None:
            # Get dimension by embedding a test string
            result = self._embed_batch(["test"])
            self._dimension = len(result[0])
        return self._dimension

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count for a text (rough approximation).

        Assumes one token per 4 characters and adds one, so even an empty
        string counts as a single token.
        """
        return len(text) // 4 + 1

    def max_safe_text_length(self, num_texts: int = 1) -> int:
        """
        Calculate maximum safe text length in characters for a batch.

        Args:
            num_texts: Number of texts in the batch (must be >= 1)

        Returns:
            Maximum characters per text to stay within limits (never below 50)

        Raises:
            ValueError: If num_texts is smaller than 1
        """
        if num_texts < 1:
            raise ValueError(f"num_texts must be >= 1, got {num_texts}")
        # Use the more restrictive limit
        tokens_per_text = min(
            self.context_length // num_texts,
            self.model_batch_size // num_texts
        )
        # Conservative: 3 chars per token to leave headroom
        return max(tokens_per_text * 3, 50)

    def _embed_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Internal method to embed a batch of texts with a single request.

        Args:
            texts: Texts to send as one "input" list

        Returns:
            One embedding per text, in the same order as ``texts``

        Raises:
            requests.HTTPError: If the server answers with an error status
            ValueError: If the number of returned embeddings differs from
                the number of texts
        """
        if not texts:
            # Nothing to embed; avoid a pointless round trip.
            return []
        response = requests.post(
            f"{self.base_url}/v1/embeddings",
            json={"model": self.model, "input": texts},
            timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json()
        # Sort by index to maintain order
        sorted_data = sorted(data["data"], key=lambda x: x["index"])
        embeddings = [item["embedding"] for item in sorted_data]
        # Callers zip texts with embeddings; a short response would otherwise
        # silently drop texts and shift every following index.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"expected {len(texts)} embeddings, server returned {len(embeddings)}"
            )
        return embeddings

    def embed(self, text: str) -> List[float]:
        """
        Embed a single text string.

        Args:
            text: Text to embed

        Returns:
            Embedding vector as list of floats
        """
        return self._embed_batch([text])[0]

    def embed_many(
        self,
        texts: List[str],
        show_progress: bool = True
    ) -> List["EmbeddingResult"]:
        """
        Embed multiple texts with automatic batching.

        Args:
            texts: List of texts to embed
            show_progress: Whether to print progress updates

        Returns:
            List of EmbeddingResult objects, one per text, whose ``index`` is
            the text's position in ``texts``
        """
        results = []
        # Ceiling division: number of requests needed for all texts.
        total_batches = (len(texts) + self.batch_size - 1) // self.batch_size

        for batch_idx, i in enumerate(range(0, len(texts), self.batch_size)):
            batch_texts = texts[i:i + self.batch_size]
            embeddings = self._embed_batch(batch_texts)

            # enumerate(..., start=i) yields the global position directly.
            results.extend(
                EmbeddingResult(text=text, embedding=embedding, index=position)
                for position, (text, embedding)
                in enumerate(zip(batch_texts, embeddings), start=i)
            )

            if show_progress:
                # "\r" with end="" rewrites the same console line on every batch.
                print(f"\rProcessed batch {batch_idx + 1}/{total_batches} "
                      f"({len(results)}/{len(texts)} texts)", end="")

        if show_progress:
            print()  # New line after progress

        return results

    def embed_iter(
        self,
        texts: List[str]
    ) -> Iterator[Tuple[int, str, List[float]]]:
        """
        Generator that yields embeddings as they're computed.
        Only one batch of embeddings is held at a time.

        Args:
            texts: List of texts to embed

        Yields:
            Tuples of (index, text, embedding)
        """
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i + self.batch_size]
            embeddings = self._embed_batch(batch_texts)

            for position, (text, embedding) in enumerate(
                zip(batch_texts, embeddings), start=i
            ):
                yield (position, text, embedding)

    def embed_to_numpy(self, texts: List[str], show_progress: bool = True) -> np.ndarray:
        """
        Embed texts and return as a numpy array.
        Ideal for vector database ingestion.

        Args:
            texts: List of texts to embed
            show_progress: Whether to print progress updates

        Returns:
            float32 numpy array of shape (n_texts, embedding_dimension)
        """
        results = self.embed_many(texts, show_progress=show_progress)
        return np.array([r.embedding for r in results], dtype=np.float32)

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts.

        Both texts are embedded with a single request.

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity score in the range -1 to 1; 0.0 when either
            embedding has zero length, because the cosine is undefined there
        """
        embeddings = self._embed_batch([text1, text2])
        v1 = np.array(embeddings[0])
        v2 = np.array(embeddings[1])
        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denominator == 0:
            # Avoid dividing by zero, which would yield NaN plus a warning.
            return 0.0
        return float(np.dot(v1, v2) / denominator)


# Shared client for the cells below, pointed at the server and model chosen
# earlier, with the model limits passed explicitly.
client = EmbeddingClient(
    base_url=LM_STUDIO_URL,
    model=EMBEDDING_MODEL,
    context_length=2048,
    model_batch_size=1024,
)

# Report the configuration. These attributes are plain reads and need no request.
print("EmbeddingClient initialized")
print("\n".join([
    f"  Server: {client.base_url}",
    f"  Model: {client.model}",
    f"  Batch size: {client.batch_size}",
    f"  Context length: {client.context_length} tokens",
    f"  Model batch size: {client.model_batch_size} tokens",
]))
# Printed last on purpose: reading ``dimension`` sends a test request to the server.
print(f"  Embedding dimension: {client.dimension}")

In [None]:
# Example usage of the EmbeddingClient API


def _preview(text: str, limit: int) -> str:
    """Return text cut to limit characters; '...' is added only if it was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


# Sample documents (simulating content for a vector database)
documents = [
    "Python is a high-level programming language known for its readability.",
    "Machine learning models can learn patterns from large datasets.",
    "Vector databases store embeddings for semantic search applications.",
    "LM Studio provides a local server for running language models.",
    "Neural networks are inspired by biological brain structures.",
    "Deep learning has revolutionized computer vision and NLP.",
    "Embeddings represent text as dense numerical vectors.",
    "Transformers use attention mechanisms to process sequences.",
    "RAG combines retrieval with generative AI for better responses.",
    "Local AI solutions offer privacy and data security benefits.",
]

# 1. Embed all documents at once (returns EmbeddingResult objects)
print("1. Embedding documents...")
results = client.embed_many(documents)
print(f"   Generated {len(results)} embeddings\n")

# 2. Get embeddings as numpy array (ready for vector DB)
print("2. Getting embeddings as numpy array...")
embeddings_matrix = client.embed_to_numpy(documents)
print(f"   Shape: {embeddings_matrix.shape}\n")

# 3. Calculate similarity between texts
print("3. Semantic similarity examples:")
pairs = [
    ("Python is great for AI", "Machine learning uses Python often"),
    ("Python is great for AI", "The weather is sunny today"),
    ("Vector databases", "Embedding storage systems"),
]
for text1, text2 in pairs:
    sim = client.similarity(text1, text2)
    # Only mark a text with "..." when it really was shortened.
    print(f"   '{_preview(text1, 30)}' vs '{_preview(text2, 30)}': {sim:.3f}")

# 4. Find most similar document to a query
print("\n4. Semantic search example:")
query = "How do neural networks learn?"
query_embedding = np.array(client.embed(query))

# Cosine similarity of the query against every document row in one vectorized
# step: dot products divided by the product of the vector lengths.
doc_norms = np.linalg.norm(embeddings_matrix, axis=1)
scores = (embeddings_matrix @ query_embedding) / (
    doc_norms * np.linalg.norm(query_embedding)
)

# (similarity, document) pairs, best match first.
similarities = sorted(zip(scores.tolist(), documents), reverse=True)
print(f"   Query: '{query}'")
print("   Top 3 matches:")
for sim, doc in similarities[:3]:
    print(f"   {sim:.3f}: {_preview(doc, 60)}")

In [None]:
# Performance test pushing model limits
# Context length: 2048 tokens, Batch size: 1024 tokens

import random
import string

def generate_text(target_tokens: int) -> str:
    """
    Generate random text with approximately target_tokens tokens.

    The text is made of space-separated random lowercase words of 3 to 10
    letters and is cut to exactly ``int(target_tokens * 3.5)`` characters.

    Args:
        target_tokens: Desired token count, converted to characters at 3.5
            characters per token

    Returns:
        The generated text; empty when the character budget is zero or negative
    """
    # Conservative estimate: ~4 chars per token
    # Use ~3.5 chars to leave safety margin
    char_count = int(target_tokens * 3.5)
    words = []
    # Length that ' '.join(words) would have, tracked incrementally so the
    # growing string is not rebuilt on every iteration.
    joined_len = 0
    while joined_len < char_count:
        word_len = random.randint(3, 10)
        words.append(''.join(random.choices(string.ascii_lowercase, k=word_len)))
        # Every word after the first also contributes one separating space.
        joined_len += word_len if len(words) == 1 else word_len + 1
    return ' '.join(words)[:char_count]

def generate_long_text(target_chars: int) -> str:
    """
    Generate random text with specific character count for long texts.

    The text is made of space-separated random lowercase words of 3 to 12
    letters and is cut to exactly ``target_chars`` characters.

    Args:
        target_chars: Exact length of the returned text in characters

    Returns:
        The generated text; empty when target_chars is zero or negative
    """
    words = []
    # Length that ' '.join(words) would have, tracked incrementally so the
    # growing string is not rebuilt on every iteration.
    joined_len = 0
    while joined_len < target_chars:
        word_len = random.randint(3, 12)
        words.append(''.join(random.choices(string.ascii_lowercase, k=word_len)))
        # Every word after the first also contributes one separating space.
        joined_len += word_len if len(words) == 1 else word_len + 1
    return ' '.join(words)[:target_chars]

print("=" * 70)
print("PERFORMANCE TEST: Pushing Model Limits")
print("=" * 70)
print(f"Context length limit: {client.context_length} tokens")
print(f"Batch size limit: {client.model_batch_size} tokens")
print()

# Test configurations
# Each test varies: (batch_count, tokens_per_text, description)
test_configs = [
    # Many short texts (maximize parallelism)
    (64, 15, "64 short texts (~15 tokens each)"),
    (128, 7, "128 very short texts (~7 tokens each)"),

    # Fewer longer texts (test context handling)
    (16, 60, "16 medium texts (~60 tokens each)"),
    (8, 120, "8 longer texts (~120 tokens each)"),

    # Push toward limits (conservative to avoid errors)
    (32, 30, "32 texts × 30 tokens (~960 total)"),
    (20, 50, "20 texts × 50 tokens (~1000 total)"),

    # Single long text (max context utilization)
    (1, 500, "1 long text (~500 tokens)"),
    (2, 400, "2 long texts (~400 tokens each)"),

    # Near context limit - single texts
    (1, 1500, "1 text at ~1500 tokens (near limit)"),
    (1, 1800, "1 text at ~1800 tokens (close to 2048)"),
]

# One dict per configuration; successful runs carry timing fields, failed runs
# only a description and a status.
results_summary = []

for batch_count, tokens_per_text, description in test_configs:
    # Generate test texts
    texts = [generate_text(tokens_per_text) for _ in range(batch_count)]

    # Estimate total tokens from the character count (4 characters per token)
    total_chars = sum(len(t) for t in texts)
    est_tokens = total_chars // 4

    # Run the embedding as ONE request, bypassing the client's own batching,
    # so the server sees exactly this batch.
    try:
        start_time = time.perf_counter()
        embeddings = client._embed_batch(texts)
        elapsed = time.perf_counter() - start_time

        throughput = batch_count / elapsed
        tokens_per_sec = est_tokens / elapsed

        results_summary.append({
            'description': description,
            'batch_count': batch_count,
            'tokens_per_text': tokens_per_text,
            'est_total_tokens': est_tokens,
            'elapsed': elapsed,
            'throughput': throughput,
            'tokens_per_sec': tokens_per_sec,
            'status': 'OK'
        })

        print(f"✓ {description}")
        print(f"  Chars: {total_chars:,} | Est. tokens: {est_tokens:,}")
        print(f"  Time: {elapsed*1000:.1f}ms | {throughput:.1f} texts/s | {tokens_per_sec:.0f} tokens/s")
        print()

    except Exception as e:
        # Deliberately broad: a failing configuration is recorded and the
        # benchmark continues with the next one.
        results_summary.append({
            'description': description,
            'status': f'FAILED: {str(e)[:50]}'
        })
        print(f"✗ {description}")
        print(f"  ERROR: {e}")
        print()

# =============================================================================
# LONG TEXT TESTS (10K-20K characters)
# =============================================================================
print("\n" + "=" * 70)
print("LONG TEXT TESTS: 10K-20K character texts")
print("=" * 70)
print("Note: These texts will likely be truncated to context_length by the model")
print()

# (character count, description) for each single-text request.
long_text_configs = [
    (10000, "10K chars (~2,500 tokens)"),
    (12000, "12K chars (~3,000 tokens)"),
    (15000, "15K chars (~3,750 tokens)"),
    (18000, "18K chars (~4,500 tokens)"),
    (20000, "20K chars (~5,000 tokens)"),
]

for char_count, description in long_text_configs:
    # Generate single long text
    text = generate_long_text(char_count)
    actual_chars = len(text)
    # Same 4-characters-per-token estimate as the short-text tests.
    est_tokens = actual_chars // 4

    try:
        start_time = time.perf_counter()
        embeddings = client._embed_batch([text])
        elapsed = time.perf_counter() - start_time

        tokens_per_sec = est_tokens / elapsed

        # 'long_text' lets the summary tell these runs apart from the
        # batched short-text runs.
        results_summary.append({
            'description': description,
            'batch_count': 1,
            'tokens_per_text': est_tokens,
            'est_total_tokens': est_tokens,
            'elapsed': elapsed,
            'throughput': 1 / elapsed,
            'tokens_per_sec': tokens_per_sec,
            'status': 'OK',
            'long_text': True
        })

        print(f"✓ {description}")
        print(f"  Actual chars: {actual_chars:,} | Est. tokens: {est_tokens:,}")
        print(f"  Time: {elapsed*1000:.1f}ms | {tokens_per_sec:.0f} tokens/s")
        print(f"  Embedding dim: {len(embeddings[0])}")
        print()

    except Exception as e:
        # Deliberately broad: record the failure and continue with the next size.
        error_msg = str(e)
        results_summary.append({
            'description': description,
            'status': 'FAILED',
            'error': error_msg[:100]
        })
        print(f"✗ {description}")
        print(f"  ERROR: {error_msg[:200]}")
        print()

# Summary table
print("\n" + "=" * 70)
print("RESULTS SUMMARY")
print("=" * 70)
print(f"{'Config':<45} {'Tokens':<8} {'Time':<10} {'Tokens/s':<10}")
print("-" * 70)

for r in results_summary:
    if r.get('status') == 'OK':
        # Two-column marker: a page icon for long-text runs, blanks otherwise.
        marker = "📄" if r.get('long_text') else "  "
        print(f"{marker}{r['description']:<43} {r['est_total_tokens']:<8} {r['elapsed']*1000:>6.1f}ms {r['tokens_per_sec']:>8.0f}")
    else:
        # Some failure records keep the message in a separate 'error' field;
        # show it so the table explains why the run failed.
        detail = r.get('status', 'FAILED')
        if r.get('error'):
            detail = f"{detail}: {r['error']}"
        print(f"❌{r['description']:<43} {detail}")

# Find optimal configuration
successful = [r for r in results_summary if r.get('status') == 'OK']
if successful:
    best = max(successful, key=lambda x: x['tokens_per_sec'])
    print()
    print(f"🏆 Best throughput: {best['description']}")
    print(f"   {best['tokens_per_sec']:.0f} tokens/second")

    # Also show best for short vs long texts
    short_texts = [r for r in successful if not r.get('long_text')]
    long_texts = [r for r in successful if r.get('long_text')]

    if short_texts:
        best_short = max(short_texts, key=lambda x: x['tokens_per_sec'])
        print(f"\n📊 Best for batched short texts: {best_short['description']}")
        print(f"   {best_short['tokens_per_sec']:.0f} tokens/second")

    if long_texts:
        best_long = max(long_texts, key=lambda x: x['tokens_per_sec'])
        print(f"\n📄 Best for long texts: {best_long['description']}")
        print(f"   {best_long['tokens_per_sec']:.0f} tokens/second")

In [None]:
# =============================================================================
# WIKIPEDIA EMBEDDING ESTIMATION
# =============================================================================
# Estimate time to embed 7M Wikipedia articles with multiple paragraphs each

# Wikipedia dataset parameters (assumed round figures, not measured values)
WIKIPEDIA_ARTICLES = 7_000_000
AVG_PARAGRAPHS_PER_ARTICLE = 5  # Typical article has ~5 paragraphs
AVG_SENTENCES_PER_PARAGRAPH = 4  # ~4 sentences per paragraph
AVG_CHARS_PER_SENTENCE = 120    # Average sentence length

# Calculate total fragments
# 7,000,000 articles x 5 paragraphs = 35,000,000 paragraphs
TOTAL_PARAGRAPHS = WIKIPEDIA_ARTICLES * AVG_PARAGRAPHS_PER_ARTICLE
# 35,000,000 paragraphs x 4 sentences = 140,000,000 sentences
TOTAL_SENTENCES = TOTAL_PARAGRAPHS * AVG_SENTENCES_PER_PARAGRAPH

def format_duration(seconds: float) -> str:
    """
    Render a duration given in seconds using the largest fitting unit.

    Values below one minute are shown in seconds, below one hour in minutes,
    below one day in hours, and everything else in days, always with one
    decimal place.
    """
    # (exclusive upper bound in seconds, seconds per unit, unit name)
    scales = (
        (60, 1, "seconds"),
        (3600, 60, "minutes"),
        (86400, 3600, "hours"),
    )
    for limit, per_unit, unit in scales:
        if seconds < limit:
            return f"{seconds / per_unit:.1f} {unit}"
    # One day or more.
    return f"{seconds / 86400:.1f} days"

print("=" * 70)
print("WIKIPEDIA EMBEDDING TIME ESTIMATION")
print("=" * 70)
print("\nDataset assumptions:")
print(f"  Articles:              {WIKIPEDIA_ARTICLES:,}")
print(f"  Paragraphs/article:    {AVG_PARAGRAPHS_PER_ARTICLE}")
print(f"  Sentences/paragraph:   {AVG_SENTENCES_PER_PARAGRAPH}")
print(f"  Chars/sentence:        {AVG_CHARS_PER_SENTENCE}")
print("\nTotal fragments to embed:")
print(f"  Paragraphs:            {TOTAL_PARAGRAPHS:,}")
print(f"  Sentences:             {TOTAL_SENTENCES:,}")

# Test different fragment sizes to find optimal configuration
fragment_configs = [
    # (chars_per_fragment, fragments_per_batch, description)
    (120, 64, "Sentence-level (~120 chars, 64/batch)"),
    (120, 128, "Sentence-level (~120 chars, 128/batch)"),
    (500, 16, "Paragraph-level (~500 chars, 16/batch)"),
    (500, 32, "Paragraph-level (~500 chars, 32/batch)"),
    (1000, 8, "Long paragraph (~1000 chars, 8/batch)"),
    (2000, 4, "Multi-paragraph (~2000 chars, 4/batch)"),
]

print("\n" + "-" * 70)
print("FRAGMENT THROUGHPUT BENCHMARKS")
print("-" * 70)

fragment_results = []

# Each configuration is timed this many times and the times are averaged.
iterations = 5
# Fragments up to this many characters are projected against the sentence
# total, longer ones against the paragraph total.
sentence_max_chars = 200

for chars_per_frag, frags_per_batch, description in fragment_configs:
    # Generate test fragments
    fragments = [generate_long_text(chars_per_frag) for _ in range(frags_per_batch)]

    # Run multiple iterations for stable measurement; the same fragments are
    # sent as one request each time.
    times = []

    for _ in range(iterations):
        start = time.perf_counter()
        client._embed_batch(fragments)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    frags_per_sec = frags_per_batch / avg_time

    # Estimate Wikipedia processing time
    if chars_per_frag <= sentence_max_chars:  # Sentence-level
        total_frags = TOTAL_SENTENCES
        frag_type = "sentences"
    else:  # Paragraph-level
        total_frags = TOTAL_PARAGRAPHS
        frag_type = "paragraphs"

    # Straight extrapolation of the measured rate to the whole dataset.
    est_seconds = total_frags / frags_per_sec

    fragment_results.append({
        'description': description,
        'chars_per_frag': chars_per_frag,
        'frags_per_batch': frags_per_batch,
        'frags_per_sec': frags_per_sec,
        'frag_type': frag_type,
        'total_frags': total_frags,
        'est_seconds': est_seconds
    })

    print(f"\n✓ {description}")
    print(f"  Batch time: {avg_time*1000:.1f}ms | {frags_per_sec:.1f} fragments/sec")
    print(f"  Wikipedia ({total_frags:,} {frag_type}): {format_duration(est_seconds)}")

# Summary and recommendations
print("\n" + "=" * 70)
print("WIKIPEDIA PROCESSING TIME ESTIMATES")
print("=" * 70)
print(f"\n{'Configuration':<45} {'Frags/s':<12} {'Est. Time':<15}")
print("-" * 70)

for r in fragment_results:
    print(f"{r['description']:<45} {r['frags_per_sec']:>8.1f}    {format_duration(r['est_seconds']):<15}")

# Find best configuration: the shortest estimated total time within each
# fragment type.
best_sentence = min(
    (r for r in fragment_results if r['frag_type'] == 'sentences'),
    key=lambda x: x['est_seconds'],
)
best_paragraph = min(
    (r for r in fragment_results if r['frag_type'] == 'paragraphs'),
    key=lambda x: x['est_seconds'],
)

print("\n" + "=" * 70)
print("RECOMMENDATIONS FOR 7M WIKIPEDIA ARTICLES")
print("=" * 70)

print(f"\n📝 SENTENCE-LEVEL EMBEDDINGS ({TOTAL_SENTENCES:,} sentences)")
print(f"   Best config: {best_sentence['description']}")
print(f"   Throughput:  {best_sentence['frags_per_sec']:.1f} sentences/second")
print(f"   Est. time:   {format_duration(best_sentence['est_seconds'])}")

print(f"\n📄 PARAGRAPH-LEVEL EMBEDDINGS ({TOTAL_PARAGRAPHS:,} paragraphs)")
print(f"   Best config: {best_paragraph['description']}")
print(f"   Throughput:  {best_paragraph['frags_per_sec']:.1f} paragraphs/second")
print(f"   Est. time:   {format_duration(best_paragraph['est_seconds'])}")

# Storage estimate
embedding_dim = client.dimension
bytes_per_embedding = embedding_dim * 4  # float32

# The header reports the dimension read from the client so it always matches
# the byte counts printed below.
print(f"\n💾 STORAGE ESTIMATES ({embedding_dim}-dim float32 embeddings)")
print(f"   Per embedding: {bytes_per_embedding:,} bytes")
print(f"   Sentence-level: {TOTAL_SENTENCES * bytes_per_embedding / 1e12:.2f} TB")
print(f"   Paragraph-level: {TOTAL_PARAGRAPHS * bytes_per_embedding / 1e9:.1f} GB")