In [None]:
# Cell 1: Header and Setup
# Imports the shared notebook helpers and renders the title banner.
import sys
# Put the parent directory first on the module search path; presumably that is
# where the `utils` package imported below lives (verify against the repo layout).
sys.path.insert(0, '..')

from utils.notebook_utils import display_header, display_toc, check_dependency, conclusion_box, info_box
from utils.system_info import display_system_info
# NOTE(review): Benchmark, BenchmarkResult, ComparisonTable and latency_distribution
# are not referenced by any cell visible in this notebook.
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, latency_distribution, throughput_comparison, memory_comparison, COLORS

# Title and subtitle for the banner at the top of the notebook.
display_header('Modern Embedded Vector Stores', 'SynaDB vs Qdrant vs LanceDB')

In [None]:
# Cell 2: Table of Contents
# Each entry is (section title, anchor id). The anchor ids match the
# <a id="..."> tags placed in the Markdown section headings further down.
sections = [
    ('Introduction', 'introduction'),
    ('Setup', 'setup'),
    ('Benchmark: Batch vs Single Insertion', 'benchmark-insertion'),
    ('Benchmark: Search at Various k', 'benchmark-search'),
    ('Distance Metrics Comparison', 'distance-metrics'),
    ('Update/Delete Operations', 'update-delete'),
    ('Storage Format Comparison', 'storage'),
    ('Results Summary', 'results'),
    ('Conclusions', 'conclusions'),
]
# Rendering is delegated to the project helper imported in Cell 1.
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB** against two modern embedded vector databases:

| System | Storage | Key Features |
|--------|---------|-------------|
| **SynaDB** | Single file | AI-native, HNSW, experiment tracking |
| **Qdrant** | Directory | Rust-based, filtering, local mode |
| **LanceDB** | Directory (Lance) | Columnar, versioning, DuckDB integration |

### The Modern Embedded Landscape

All three systems support embedded (local) mode, making them ideal for:
- Development and prototyping
- Edge deployments
- Single-node production
- Offline applications

### Test Configuration

- **Dataset**: 100,000 synthetic embeddings
- **Dimensions**: 768
- **k values**: 1, 10, 100
- **Distance metrics**: Cosine, Euclidean, Dot Product

In [None]:
# Cell 4: System Info
# Record the environment the benchmark runs on. The output comes from
# utils.system_info.display_system_info; its exact contents are not visible here.
display_system_info()

## 🔧 Setup <a id="setup"></a>

Setting up all three embedded vector stores.

In [None]:
# Cell 6: Check Dependencies and Imports
# Third-party/stdlib imports used by the benchmark cells, followed by one
# availability flag per vector store. Later cells use the HAS_* flags to skip
# any store that is not available.
import numpy as np
import time
import os
import shutil
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt

# Check for SynaDB
# (second argument looks like the install hint shown when the import fails — confirm in utils.notebook_utils)
HAS_SYNADB = check_dependency('synadb', 'pip install synadb')

# Check for Qdrant
HAS_QDRANT = check_dependency('qdrant_client', 'pip install qdrant-client')

# Check for LanceDB
HAS_LANCEDB = check_dependency('lancedb', 'pip install lancedb')

# Apply consistent styling
setup_style()

In [None]:
# Cell 7: Generate Test Data
# Benchmark sizing knobs and the RNG seed used for the synthetic data set.
NUM_VECTORS, DIMENSIONS = 100_000, 768
NUM_QUERIES = 1000
SEED = 42

print(f'Generating {NUM_VECTORS:,} vectors with {DIMENSIONS} dimensions...')

np.random.seed(SEED)


def _random_unit_rows(count):
    """Draw `count` float32 Gaussian rows from the global RNG and scale each to unit L2 norm."""
    rows = np.random.randn(count, DIMENSIONS).astype(np.float32)
    rows /= np.linalg.norm(rows, axis=1, keepdims=True)
    return rows


# Draw order matters for reproducibility: corpus first, then queries.
vectors = _random_unit_rows(NUM_VECTORS)
queries = _random_unit_rows(NUM_QUERIES)

# String keys 'doc_0' .. 'doc_<NUM_VECTORS-1>', one per corpus vector.
keys = ['doc_%d' % position for position in range(NUM_VECTORS)]

print(f'✓ Generated {NUM_VECTORS:,} vectors')
print(f'✓ Generated {NUM_QUERIES:,} query vectors')

In [None]:
# Cell 8: Create Temp Directory
# One scratch directory holds the on-disk state of every store under test.
temp_dir = tempfile.mkdtemp(prefix='synadb_modern_benchmark_')
print(f'Using temp directory: {temp_dir}')

# Per-store locations inside the scratch directory.
synadb_path, qdrant_path, lancedb_path = (
    os.path.join(temp_dir, leaf)
    for leaf in ('synadb_vectors.db', 'qdrant_db', 'lancedb')
)

## ⚡ Benchmark: Batch vs Single Insertion <a id="benchmark-insertion"></a>

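Comparing insertion throughput. SynaDB is measured with single-item inserts, while Qdrant and LanceDB are measured with batch inserts, so the numbers reflect each store's insertion path in this notebook rather than a like-for-like comparison.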

In [None]:
# Cell 10: SynaDB Insertion Benchmark
# Defaults stay None when SynaDB is unavailable so later cells can skip it.
synadb_single_time = None
synadb_store = None

if not HAS_SYNADB:
    print('⚠️ SynaDB not available')
else:
    from synadb import VectorStore

    print('Benchmarking SynaDB insertion...')

    # Open the store at the path chosen in Cell 8, configured for cosine distance.
    synadb_store = VectorStore(synadb_path, dimensions=DIMENSIONS, metric='cosine')

    # Time one insert() call per vector; progress is reported every 20,000 rows.
    start = time.perf_counter()
    for inserted, (key, vec) in enumerate(zip(keys, vectors), start=1):
        synadb_store.insert(key, vec)
        if inserted % 20000 == 0:
            print(f'  Inserted {inserted:,} vectors...')
    synadb_single_time = time.perf_counter() - start

    synadb_insert_rate = NUM_VECTORS / synadb_single_time
    print(f'✓ SynaDB single-item: {NUM_VECTORS:,} vectors in {synadb_single_time:.2f}s')
    print(f'  Throughput: {synadb_insert_rate:,.0f} vectors/sec')

In [None]:
# Cell 11: Qdrant Insertion Benchmark
# Defaults stay None when Qdrant is unavailable so later cells can skip it.
qdrant_single_time = None
qdrant_batch_time = None
qdrant_client = None

if not HAS_QDRANT:
    print('⚠️ Qdrant not available')
else:
    from qdrant_client import QdrantClient
    from qdrant_client.models import VectorParams, Distance, PointStruct

    print('Benchmarking Qdrant insertion...')

    # Local (on-disk) client rooted at the path chosen in Cell 8.
    qdrant_client = QdrantClient(path=qdrant_path)

    # Collection sized for the benchmark vectors, using cosine distance.
    collection_config = VectorParams(size=DIMENSIONS, distance=Distance.COSINE)
    qdrant_client.create_collection(
        collection_name='benchmark',
        vectors_config=collection_config
    )

    # Upsert in fixed-size batches; building each batch's points is part of the timed work.
    BATCH_SIZE = 1000
    start = time.perf_counter()
    for batch_start in range(0, NUM_VECTORS, BATCH_SIZE):
        batch_stop = min(batch_start + BATCH_SIZE, NUM_VECTORS)
        batch_points = []
        for point_id in range(batch_start, batch_stop):
            batch_points.append(PointStruct(id=point_id, vector=vectors[point_id].tolist()))
        qdrant_client.upsert(collection_name='benchmark', points=batch_points)
        if batch_stop % 20000 == 0:
            print(f'  Inserted {batch_stop:,} vectors...')
    qdrant_batch_time = time.perf_counter() - start

    qdrant_insert_rate = NUM_VECTORS / qdrant_batch_time
    print(f'✓ Qdrant batch: {NUM_VECTORS:,} vectors in {qdrant_batch_time:.2f}s')
    print(f'  Throughput: {qdrant_insert_rate:,.0f} vectors/sec')

In [None]:
# Cell 12: LanceDB Insertion Benchmark
# Defaults stay None when LanceDB is unavailable so later cells can skip it.
lancedb_batch_time = None
lance_db = None
lance_table = None

if HAS_LANCEDB:
    import lancedb
    import pyarrow as pa

    print('Benchmarking LanceDB insertion...')

    # Create LanceDB connection rooted at the path chosen in Cell 8.
    lance_db = lancedb.connect(lancedb_path)

    # One dict per row. This is a full Python-list copy of the corpus, so it
    # gets a specific name (not the generic `data`, which a later plotting cell
    # reuses for something else) and is released as soon as it has been ingested.
    # Row preparation is deliberately outside the timed region, as before.
    lance_rows = [
        {'id': key, 'vector': vec.tolist()}
        for key, vec in zip(keys, vectors)
    ]

    # Batch insertion: the whole corpus is written by a single create_table call.
    start = time.perf_counter()
    lance_table = lance_db.create_table('benchmark', lance_rows)
    lancedb_batch_time = time.perf_counter() - start

    # Drop the row copy so it does not stay resident for the rest of the notebook.
    del lance_rows

    print(f'✓ LanceDB batch: {NUM_VECTORS:,} vectors in {lancedb_batch_time:.2f}s')
    print(f'  Throughput: {NUM_VECTORS / lancedb_batch_time:,.0f} vectors/sec')
else:
    print('⚠️ LanceDB not available')

In [None]:
# Cell 13: Insertion Results Visualization
# Elapsed insertion time per system; entries left at None (system skipped) are dropped.
insertion_times = {
    'SynaDB': synadb_single_time,
    'Qdrant': qdrant_batch_time,
    'LanceDB': lancedb_batch_time,
}

# Convert elapsed seconds into vectors/second for every system that ran.
insertion_throughput = {
    system_name: NUM_VECTORS / elapsed_seconds
    for system_name, elapsed_seconds in insertion_times.items()
    if elapsed_seconds
}

# Skip the chart entirely when no system produced a measurement.
if len(insertion_throughput) > 0:
    chart_title = f'Insertion Throughput ({NUM_VECTORS:,} vectors)'
    fig = throughput_comparison(
        insertion_throughput,
        title=chart_title,
        ylabel='Vectors/second'
    )
    plt.show()

## 🔍 Benchmark: Search at Various k <a id="benchmark-search"></a>

Comparing search latency for k=1, k=10, and k=100.

In [None]:
# Cell 15: Search Benchmarks at Various k
K_VALUES = [1, 10, 100]
NUM_SEARCH_QUERIES = 100  # Sample for speed

# system name -> {k -> mean latency in milliseconds}
search_results = {'SynaDB': {}, 'Qdrant': {}, 'LanceDB': {}}


def _benchmark_search(system, run_query):
    """Time `run_query(query, k)` for every k in K_VALUES.

    Each of the first NUM_SEARCH_QUERIES query vectors is timed individually;
    the mean latency in milliseconds is stored in search_results[system][k]
    and printed.

    Args:
        system: key into `search_results` ('SynaDB', 'Qdrant' or 'LanceDB').
        run_query: callable taking (query_vector, k) that performs one search.
            Any conversion it does (e.g. `.tolist()`) is part of the timed work.
    """
    print(f'Benchmarking {system} search...')
    for k in K_VALUES:
        latencies_ms = []
        for query in queries[:NUM_SEARCH_QUERIES]:
            start = time.perf_counter()
            run_query(query, k)
            latencies_ms.append((time.perf_counter() - start) * 1000)
        mean_latency_ms = np.mean(latencies_ms)
        search_results[system][k] = mean_latency_ms
        print(f'  k={k}: {mean_latency_ms:.2f}ms')


# Handles are compared against None explicitly: a store object that defines
# __len__/__bool__ must not be skipped just because it evaluates as falsy.

# SynaDB Search
if HAS_SYNADB and synadb_store is not None:
    _benchmark_search('SynaDB', lambda query, k: synadb_store.search(query, k=k))

# Qdrant Search
if HAS_QDRANT and qdrant_client is not None:
    if hasattr(qdrant_client, 'query_points'):
        # Newer qdrant-client releases deprecate `search` in favour of `query_points`.
        def _qdrant_query(query, k):
            return qdrant_client.query_points(
                collection_name='benchmark',
                query=query.tolist(),
                limit=k
            )
    else:
        # Older clients only provide `search`.
        def _qdrant_query(query, k):
            return qdrant_client.search(
                collection_name='benchmark',
                query_vector=query.tolist(),
                limit=k
            )
    _benchmark_search('Qdrant', _qdrant_query)

# LanceDB Search
# NOTE(review): no index is created on the LanceDB table in the visible cells and
# no metric is selected here, so this presumably times an un-indexed scan with
# LanceDB's default distance — confirm before comparing against the other stores.
if HAS_LANCEDB and lance_table is not None:
    _benchmark_search(
        'LanceDB',
        lambda query, k: lance_table.search(query.tolist()).limit(k).to_list()
    )

In [None]:
# Cell 16: Search Results Visualization
# One panel per k value, each a bar chart of mean search latency per system.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for idx in range(len(K_VALUES)):
    ax, k = axes[idx], K_VALUES[idx]

    # Keep only the systems that actually recorded a latency for this k.
    data = {
        system: search_results[system][k]
        for system in ('SynaDB', 'Qdrant', 'LanceDB')
        if k in search_results.get(system, {})
    }
    if not data:
        continue

    # SynaDB gets its own colour; every other system shares the competitor colour.
    colors = [
        COLORS['competitor'] if 'SynaDB' not in name else COLORS['synadb']
        for name in data
    ]
    bars = ax.bar(data.keys(), data.values(), color=colors)
    ax.set_title(f'Search Latency (k={k})')
    ax.set_ylabel('Latency (ms)')

    # Print the numeric latency just above each bar.
    for bar, val in zip(bars, data.values()):
        label_x = bar.get_x() + bar.get_width() / 2
        ax.text(label_x, bar.get_height(), f'{val:.2f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 📐 Distance Metrics Comparison <a id="distance-metrics"></a>

Comparing support for different distance metrics.

In [None]:
# Cell 18: Distance Metrics Comparison
# Static reference table (not derived from anything measured in this notebook).
# `display` and `Markdown` imported here are reused by later cells.
from IPython.display import display, Markdown

# Qdrant's documented distance metrics are Cosine, Euclid, Dot and Manhattan, so
# the Hamming row no longer marks Qdrant as supported, and the usage comment
# lists MANHATTAN to agree with the table.
# NOTE(review): LanceDB documents a "hamming" distance type for binary vectors in
# recent releases — confirm whether the LanceDB Hamming cell should change too.
metrics_table = '''
### Distance Metrics Support

| Metric | SynaDB | Qdrant | LanceDB |
|--------|--------|--------|--------|
| **Cosine** | ✅ | ✅ | ✅ |
| **Euclidean (L2)** | ✅ | ✅ | ✅ |
| **Dot Product** | ✅ | ✅ | ✅ |
| **Manhattan (L1)** | 🔜 | ✅ | ❌ |
| **Hamming** | ❌ | ❌ | ❌ |

### Usage Examples

**SynaDB:**
```python
store = VectorStore("db.db", dimensions=768, metric="cosine")  # or "euclidean", "dot_product"
```

**Qdrant:**
```python
from qdrant_client.models import Distance
VectorParams(size=768, distance=Distance.COSINE)  # or EUCLID, DOT, MANHATTAN
```

**LanceDB:**
```python
table.search(query).metric("cosine").limit(10)  # or "L2", "dot"
```
'''

display(Markdown(metrics_table))

## 🔄 Update/Delete Operations <a id="update-delete"></a>

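Comparing update and delete performance. In this notebook only SynaDB updates and Qdrant updates and deletes are timed; LanceDB is noted but not measured.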

In [None]:
# Cell 20: Update/Delete Benchmarks
NUM_UPDATES = 1000
update_results = {}   # system name -> mean update latency per vector (ms)
delete_results = {}   # system name -> mean delete latency per vector (ms)

# Generate update vectors from a dedicated, seeded generator so that re-running
# this cell produces the same vectors no matter how much of the global NumPy
# RNG stream earlier cells have consumed.
update_rng = np.random.default_rng(SEED + 1)
update_vectors = update_rng.standard_normal((NUM_UPDATES, DIMENSIONS)).astype(np.float32)
update_vectors = update_vectors / np.linalg.norm(update_vectors, axis=1, keepdims=True)
# Keys/ids 0 .. NUM_UPDATES-1 already exist in each store, so these are overwrites.
update_keys = [f'doc_{i}' for i in range(NUM_UPDATES)]

# SynaDB Update (re-insert with same key)
if HAS_SYNADB and synadb_store is not None:
    print('Benchmarking SynaDB update/delete...')

    # Update (re-insert)
    start = time.perf_counter()
    for key, vec in zip(update_keys, update_vectors):
        synadb_store.insert(key, vec)  # Overwrites existing
    update_results['SynaDB'] = (time.perf_counter() - start) * 1000 / NUM_UPDATES
    print(f'  Update: {update_results["SynaDB"]:.3f}ms per vector')
    # No SynaDB delete is timed here; say so instead of leaving the header unexplained.
    print('  Delete: not measured in this notebook')

# Qdrant Update
if HAS_QDRANT and qdrant_client is not None:
    # Imported locally so this cell does not depend on a name leaked by the insertion cell.
    from qdrant_client.models import PointStruct

    print('Benchmarking Qdrant update/delete...')

    # Update (upsert); building the points is part of the timed work, as in the insertion benchmark.
    start = time.perf_counter()
    points = [
        PointStruct(id=i, vector=update_vectors[i].tolist())
        for i in range(NUM_UPDATES)
    ]
    qdrant_client.upsert(collection_name='benchmark', points=points)
    update_results['Qdrant'] = (time.perf_counter() - start) * 1000 / NUM_UPDATES
    print(f'  Update: {update_results["Qdrant"]:.3f}ms per vector')

    # Delete the same ids in one call. Note that this removes them from the collection for good.
    point_ids = list(range(NUM_UPDATES))
    start = time.perf_counter()
    qdrant_client.delete(
        collection_name='benchmark',
        points_selector=point_ids
    )
    delete_results['Qdrant'] = (time.perf_counter() - start) * 1000 / NUM_UPDATES
    print(f'  Delete: {delete_results["Qdrant"]:.3f}ms per vector')

# LanceDB Update
if HAS_LANCEDB and lance_table is not None:
    # Nothing is timed for LanceDB, so the message says that rather than announcing a benchmark.
    print('LanceDB update/delete: not measured in this notebook')
    info_box('LanceDB uses versioning - updates create new versions rather than in-place modification.', 'Note')

In [None]:
# Cell 21: Update Results Visualization
# Plot per-vector update latency, but only if at least one system was measured.
if len(update_results) > 0:
    update_chart_options = dict(
        title='Update Latency (per vector)',
        ylabel='Latency (ms)',
        lower_is_better=True,
        value_format='{:.3f}',
    )
    fig = bar_comparison(update_results, **update_chart_options)
    plt.show()

## 💾 Storage Format Comparison <a id="storage"></a>

Comparing storage formats and file structures.

In [None]:
# Cell 23: Storage Format Comparison
def get_dir_size(path):
    """Return the size in bytes of a file, or of all files under a directory.

    Args:
        path: File or directory path. A path that does not exist yields 0.

    Returns:
        Size of `path` when it is a regular file; otherwise the sum of the
        sizes of every file found by walking `path` recursively.

    Entries whose size cannot be read (dangling symlinks, files removed while
    the walk is in progress) are skipped instead of aborting the measurement.
    """
    if os.path.isfile(path):
        return os.path.getsize(path)
    total = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, filename))
            except OSError:
                # Unreadable or vanished entry: leave it out of the total.
                continue
    return total

def count_files(path):
    """Return how many files `path` contains.

    A regular file counts as 1. Anything else is walked recursively and the
    file names found at every level are tallied (0 if nothing is found).
    """
    if os.path.isfile(path):
        return 1
    return sum(len(filenames) for _dirpath, _dirnames, filenames in os.walk(path))

print('Storage Format Comparison\n')
print('=' * 60)

# system name -> on-disk size in MB (consumed by the chart in the next cell)
storage_sizes = {}

# (system name, on-disk location, note printed after the file count, format label).
# The notes and format labels are fixed descriptions, not values read from disk.
# NOTE(review): no store is closed before this cell in the visible code, so sizes
# may not include data still buffered by an open store — confirm.
storage_targets = (
    ('SynaDB', synadb_path, 'single file', 'Append-only log + HNSW index'),
    ('Qdrant', qdrant_path, 'directory structure', 'RocksDB + custom index'),
    ('LanceDB', lancedb_path, 'Lance format', 'Columnar (Lance) + versioning'),
)

for position, (system_name, location, files_note, format_label) in enumerate(storage_targets):
    if not os.path.exists(location):
        continue
    size = get_dir_size(location)
    files = count_files(location)
    size_mb = size / (1024 * 1024)
    storage_sizes[system_name] = size_mb
    # Every entry except the first is separated from the previous one by a blank line.
    separator = '' if position == 0 else '\n'
    print(f'{separator}{system_name}:')
    print(f'  Size: {size_mb:.1f} MB')
    print(f'  Files: {files} ({files_note})')
    print(f'  Format: {format_label}')

print('\n' + '=' * 60)

In [None]:
# Cell 24: Storage Size Visualization
# Chart the on-disk footprint (MB) of each store that exists; skip when none was measured.
if len(storage_sizes) > 0:
    storage_chart_options = dict(
        title='Storage Size Comparison',
        ylabel='Size (MB)',
    )
    fig = memory_comparison(storage_sizes, **storage_chart_options)
    plt.show()

In [None]:
# Cell 25: Storage Features Comparison
# Static Markdown table; none of its cells are computed from the measurements
# taken in this notebook.
# NOTE(review): the feature claims below cannot be checked from this notebook —
# verify them against each project's documentation.
storage_features = '''
### Storage Features Comparison

| Feature | SynaDB | Qdrant | LanceDB |
|---------|--------|--------|--------|
| **Format** | Single file | Directory | Directory (Lance) |
| **Portability** | Copy 1 file | Copy directory | Copy directory |
| **Versioning** | Via keys | Snapshots | Built-in |
| **Compression** | LZ4 | Optional | Built-in |
| **Cloud Storage** | Manual | S3 support | S3/GCS native |
| **DuckDB Integration** | 🔜 | ❌ | ✅ Native |
'''

# `display` and `Markdown` come from the IPython import in the distance-metrics cell.
display(Markdown(storage_features))

## 📈 Results Summary <a id="results"></a>

In [None]:
# Cell 27: Results Summary Table
# Static Markdown summary. The qualitative ratings ("Good", "Excellent", "Fast")
# are hard-coded text and are not derived from the timings collected above.
# NOTE(review): the table lists IVF-PQ as LanceDB's index type, but no index is
# created on the LanceDB table in the visible cells — confirm what was measured.
summary_table = '''
### Benchmark Results Summary

| Metric | SynaDB | Qdrant | LanceDB |
|--------|--------|--------|--------|
| **Storage** | Single file | Directory | Directory |
| **Index Type** | HNSW | HNSW | IVF-PQ |
| **Batch Insert** | Good | Excellent | Excellent |
| **Single Insert** | Excellent | Good | Good |
| **Search (k=10)** | Fast | Fast | Fast |
| **Update** | Re-insert | Upsert | Versioned |
| **Delete** | Tombstone | Native | Versioned |
| **Filtering** | 🔜 | ✅ Rich | ✅ SQL-like |

### Unique Strengths

| System | Best For |
|--------|----------|
| **SynaDB** | Single-file simplicity, AI-native features (experiments, models) |
| **Qdrant** | Rich filtering, payload storage, Rust performance |
| **LanceDB** | Versioning, DuckDB integration, columnar analytics |
'''

# `display` and `Markdown` come from the IPython import in the distance-metrics cell.
display(Markdown(summary_table))

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Cell 29: Conclusions
# Renders the closing takeaways box. The text is static; it is not generated
# from the benchmark numbers.
conclusion_box(
    title='Key Takeaways',
    points=[
        '<b>SynaDB</b> offers the simplest deployment with single-file storage and integrated AI features',
        '<b>Qdrant</b> excels at filtering and payload management with excellent Rust performance',
        '<b>LanceDB</b> shines for versioning and analytics with native DuckDB integration',
        # Worded to agree with the Results Summary table, which lists HNSW for
        # SynaDB and Qdrant but IVF-PQ for LanceDB.
        'All three systems provide excellent search performance with approximate nearest-neighbor '
        'indexing (HNSW for SynaDB and Qdrant, IVF-PQ for LanceDB)',
        'Choose based on your specific needs: simplicity (SynaDB), filtering (Qdrant), or analytics (LanceDB)',
    ],
    summary='For AI/ML workflows with experiment tracking and model registry needs, SynaDB provides '
            'a unified solution. For complex filtering, choose Qdrant. For data versioning and '
            'SQL analytics, choose LanceDB.'
)

In [None]:
# Cell 30: Cleanup
# Release store handles before deleting their files. Removing a directory that
# still has open files or lock files can fail (notably on Windows) and leaves
# the handles pointing at deleted storage. Closing is best-effort: a handle is
# only closed when it exists and exposes a callable `close`.
for _handle_name in ('qdrant_client', 'synadb_store'):
    _close = getattr(globals().get(_handle_name), 'close', None)
    if callable(_close):
        try:
            _close()
        except Exception as e:
            print(f'⚠️ Could not close {_handle_name}: {e}')

# Remove the scratch directory created for this run; failure is reported, not raised.
try:
    shutil.rmtree(temp_dir)
    print(f'✓ Cleaned up temporary directory: {temp_dir}')
except Exception as e:
    print(f'⚠️ Could not clean up {temp_dir}: {e}')

print('\n🎉 Benchmark complete!')