In [None]:
# Cell 1: Header and Setup
import sys
# Put the parent directory first on the module search path so the `utils`
# package imported below can be resolved (presumably it lives one level above
# this notebook -- TODO confirm against the repository layout).
sys.path.insert(0, '..')

# Project-local helpers used by the cells that follow.
from utils.notebook_utils import display_header, display_toc, check_dependency, conclusion_box, info_box
from utils.system_info import display_system_info
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, throughput_comparison, memory_comparison, COLORS

# Render the notebook banner; the two strings look like title and subtitle
# (helper is defined in utils.notebook_utils, not visible here).
display_header('Embedded Vector Store Comparison', 'SynaDB vs Chroma')

In [None]:
# Cell 2: Table of Contents
# Each entry is (display title, anchor slug). Every slug matches the
# `<a id="...">` anchor on the corresponding markdown section heading, so keep
# the two in sync when renaming a section (display_toc presumably links to
# `#slug` -- TODO confirm).
sections = [
    ('Introduction', 'introduction'),
    ('Setup', 'setup'),
    ('Benchmark: Insertion', 'benchmark-insertion'),
    ('Benchmark: Search', 'benchmark-search'),
    ('Benchmark: Recall@k', 'benchmark-recall'),
    ('Demo: RAG Pipeline', 'demo-rag'),
    ('Persistence Comparison', 'persistence'),
    ('Results Summary', 'results'),
    ('Conclusions', 'conclusions'),
]
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB** against **Chroma**, two popular embedded vector databases.

| System | Type | Key Features |
|--------|------|-------------|
| **SynaDB** | Embedded | Single-file, AI-native, HNSW index, FAISS backend option |
| **Chroma** | Embedded | Popular for LLM apps, directory-based storage |

### Why These Two?

Both are **embedded** vector databases requiring no server. They target:
- Local RAG applications
- Development and prototyping
- Single-machine deployments

### What We'll Measure

- **Insertion throughput** (vectors/sec)
- **Search latency** (mean and p95 per query, in ms, at k=10)
- **Recall@k** (search quality at k=10, measured against exact brute-force neighbours)
- **Storage size** on disk (MB)

### Test Configuration

- **Dataset**: 100,000 synthetic embeddings
- **Dimensions**: 768 (a common output size for sentence-transformer models)
- **Queries**: 1,000 random queries

> **Note**: For billion-scale search, SynaDB supports FAISS as an optional backend.
> See `02_faiss_backend.ipynb` for details on using `--features faiss`.

In [None]:
# Record the environment the benchmark runs on (presumably hardware/software
# details; the output is defined by utils.system_info, not visible here).
display_system_info()

## 🔧 Setup <a id="setup"></a>

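This section checks the optional dependencies, generates 100K synthetic unit-normalised embeddings plus 1,000 query vectors, and creates a temporary directory for both stores.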

In [None]:
# Import runtime modules, probe for the optional backends, and apply chart styling
import numpy as np
import time
import os
import tempfile
import matplotlib.pyplot as plt

# check_dependency receives the module name and a pip install hint. The two
# flags gate every SynaDB / Chroma cell below (`if HAS_SYNADB:` /
# `if HAS_CHROMA:`), so a missing package skips that system's benchmarks.
HAS_SYNADB = check_dependency('synadb', 'pip install synadb')
HAS_CHROMA = check_dependency('chromadb', 'pip install chromadb')
setup_style()

In [None]:
# Configure benchmark parameters and generate test data
NUM_VECTORS = 100_000
DIMENSIONS = 768
NUM_QUERIES = 1000
SEED = 42

print(f'Generating {NUM_VECTORS:,} vectors...')
# Seed NumPy's global RNG. Vectors are drawn first and queries second, so the
# generated data depends on this call order as well as on SEED.
np.random.seed(SEED)

# Scale every row to unit L2 norm, so a plain dot product between two rows
# equals their cosine similarity.
vectors = np.random.randn(NUM_VECTORS, DIMENSIONS).astype(np.float32)
vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

queries = np.random.randn(NUM_QUERIES, DIMENSIONS).astype(np.float32)
queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)

# String ids, one per vector, in the same order as the rows of `vectors`.
keys = [f'doc_{i}' for i in range(NUM_VECTORS)]
# nbytes / 1024 / 1024 converts the array size from bytes to MB.
print(f'✓ Generated {NUM_VECTORS:,} vectors ({vectors.nbytes / 1024 / 1024:.1f} MB)')

In [None]:
# Create a fresh scratch directory for this run; both stores write beneath it
# and the final cell removes it again.
temp_dir = tempfile.mkdtemp(prefix='synadb_benchmark_')
# Path handed to synadb.VectorStore in the insertion benchmark.
synadb_path = os.path.join(temp_dir, 'synadb.db')
# Path handed to chromadb.PersistentClient in the insertion benchmark.
chroma_path = os.path.join(temp_dir, 'chroma_db')
print(f'Temp directory: {temp_dir}')

## ⚡ Benchmark: Insertion <a id="benchmark-insertion"></a>

In [None]:
# Benchmark SynaDB insertion: time inserting every vector into a fresh store
# Both names stay None when SynaDB is unavailable, so later cells can skip it.
synadb_insert_time = None
synadb_store = None

if HAS_SYNADB:
    from synadb import VectorStore
    print('Benchmarking SynaDB insertion...')
    # The store is created before the timer starts, so creation is not measured.
    synadb_store = VectorStore(synadb_path, dimensions=DIMENSIONS, metric='cosine')
    
    start = time.perf_counter()
    # Vectors are inserted one call at a time, whereas the Chroma benchmark
    # adds them in batches -- the two throughput figures are therefore not
    # strictly like-for-like.
    for i, (key, vec) in enumerate(zip(keys, vectors)):
        synadb_store.insert(key, vec)
        # Progress output every 20,000 vectors (printed inside the timed region).
        if (i + 1) % 20000 == 0:
            print(f'  Inserted {i + 1:,}...')
    synadb_insert_time = time.perf_counter() - start
    print(f'✓ SynaDB: {NUM_VECTORS:,} vectors in {synadb_insert_time:.2f}s ({NUM_VECTORS/synadb_insert_time:,.0f} vec/s)')

In [None]:
# Benchmark Chroma insertion: time adding every vector in fixed-size batches
# Both names stay None when chromadb is unavailable, so later cells can skip it.
chroma_insert_time = None
chroma_collection = None

if HAS_CHROMA:
    import chromadb
    print('Benchmarking Chroma insertion...')
    # Client and collection are created before the timer starts, so their
    # setup cost is not measured.
    client = chromadb.PersistentClient(path=chroma_path)
    chroma_collection = client.create_collection('benchmark', metadata={'hnsw:space': 'cosine'})
    
    # Number of vectors passed to each add() call.
    BATCH = 5000
    start = time.perf_counter()
    for i in range(0, NUM_VECTORS, BATCH):
        end = min(i + BATCH, NUM_VECTORS)
        # The NumPy -> Python list conversion happens inside the timed region
        # and therefore counts towards Chroma's insertion time.
        chroma_collection.add(ids=keys[i:end], embeddings=vectors[i:end].tolist())
        # Progress output every 20,000 vectors (printed inside the timed region).
        if end % 20000 == 0:
            print(f'  Inserted {end:,}...')
    chroma_insert_time = time.perf_counter() - start
    print(f'✓ Chroma: {NUM_VECTORS:,} vectors in {chroma_insert_time:.2f}s ({NUM_VECTORS/chroma_insert_time:,.0f} vec/s)')

In [None]:
# Chart insertion throughput for every system that was actually benchmarked
_insert_seconds = {'SynaDB': synadb_insert_time, 'Chroma': chroma_insert_time}
throughput = {
    system: NUM_VECTORS / seconds
    for system, seconds in _insert_seconds.items()
    if seconds  # None (system skipped) or 0.0 leaves the system out
}
if throughput:
    throughput_comparison(throughput, title='Insertion Throughput', ylabel='Vectors/sec')
    plt.show()

## 🔍 Benchmark: Search <a id="benchmark-search"></a>

In [None]:
# Benchmark SynaDB search: per-query latency (ms) plus returned keys for recall
synadb_times, synadb_results = [], []
# NOTE(review): `synadb_store` is tested by truthiness; if VectorStore defines
# __len__/__bool__, an empty store would skip this benchmark -- confirm.
if HAS_SYNADB and synadb_store:
    print('Benchmarking SynaDB search...')
    for _ in range(5): synadb_store.search(queries[0], k=10)  # warmup
    for i, q in enumerate(queries):
        start = time.perf_counter()
        results = synadb_store.search(q, k=10)
        # Only the search call is timed; perf_counter seconds are scaled to ms.
        synadb_times.append((time.perf_counter() - start) * 1000)
        # Keep the returned keys so recall can be computed against ground truth.
        synadb_results.append([r.key for r in results])
    print(f'✓ SynaDB: mean={np.mean(synadb_times):.2f}ms, p95={np.percentile(synadb_times, 95):.2f}ms')

In [None]:
# Benchmark search performance with warmup
# Chroma: per-query latency (ms) plus returned ids for the recall calculation.
chroma_times, chroma_results = [], []
if HAS_CHROMA and chroma_collection:
    print('Benchmarking Chroma search...')
    # Five untimed warmup queries before measurement starts.
    for _ in range(5): chroma_collection.query(query_embeddings=[queries[0].tolist()], n_results=10)
    for i, q in enumerate(queries):
        start = time.perf_counter()
        # The NumPy -> list conversion of the query is inside the timed region.
        res = chroma_collection.query(query_embeddings=[q.tolist()], n_results=10)
        chroma_times.append((time.perf_counter() - start) * 1000)
        # One query is sent per call, so the ids for it are at index 0.
        chroma_results.append(res['ids'][0])
    print(f'✓ Chroma: mean={np.mean(chroma_times):.2f}ms, p95={np.percentile(chroma_times, 95):.2f}ms')

In [None]:
# Chart mean search latency for each system that produced timings
_latency_samples = {'SynaDB': synadb_times, 'Chroma': chroma_times}
latencies = {
    system: np.mean(samples)
    for system, samples in _latency_samples.items()
    if samples  # an empty list means that system was skipped
}
if latencies:
    bar_comparison(latencies, title='Search Latency (k=10)', ylabel='ms', lower_is_better=True)
    plt.show()

## 📊 Benchmark: Recall@k <a id="benchmark-recall"></a>

In [None]:
# Compute ground truth for recall calculation
# Exact top-10 neighbours per query by brute force: score every stored vector
# with a dot product, take the ten highest scores, best first.
print('Computing ground truth...')
ground_truth = [
    [keys[idx] for idx in np.argsort(np.dot(vectors, query))[-10:][::-1]]
    for query in queries
]
print('✓ Ground truth computed')

In [None]:
# Recall@k helper used to score each system against the exact neighbours
def calc_recall(pred, gt, k=10):
    """Return the mean recall@k over paired prediction / ground-truth lists.

    Args:
        pred: sequence of per-query result lists (predicted ids).
        gt: sequence of per-query ground-truth id lists, paired with `pred`
            by position (pairing stops at the shorter of the two).
        k: number of leading items considered on both sides; also the
            denominator of each per-query recall.

    Returns:
        The NumPy mean of the per-query values `|pred[:k] ∩ gt[:k]| / k`.
    """
    per_query = []
    for found, expected in zip(pred, gt):
        hits = set(found[:k]).intersection(expected[:k])
        per_query.append(len(hits) / k)
    return np.mean(per_query)

# Score each system that returned search results, then report Recall@10
_search_results = {'SynaDB': synadb_results, 'Chroma': chroma_results}
recall = {
    system: calc_recall(found, ground_truth)
    for system, found in _search_results.items()
    if found  # an empty list means that system was skipped
}
for name, val in recall.items():
    print(f'{name} Recall@10: {val:.4f}')

## 🤖 Demo: RAG Pipeline <a id="demo-rag"></a>

In [None]:
# Demonstrate RAG retrieval pipeline: top-3 lookup for one query on each store
print('RAG Demo: Top-3 retrieval\n' + '='*50)
q = queries[0]
if HAS_SYNADB and synadb_store:
    print('\n📦 SynaDB:')
    for r in synadb_store.search(q, k=3):
        print(f'  {r.key} (score: {r.score:.4f})')
if HAS_CHROMA and chroma_collection:
    print('\n📦 Chroma:')
    res = chroma_collection.query(query_embeddings=[q.tolist()], n_results=3)
    # Loop variable is `doc_id`, not `id`: at notebook top level the loop
    # variable is a kernel-wide global, and `id` would shadow the builtin
    # id() for every later cell.
    # NOTE(review): SynaDB reports `score` while Chroma reports `distances`;
    # the two numbers are presumably not directly comparable -- confirm.
    for doc_id, dist in zip(res['ids'][0], res['distances'][0]):
        print(f'  {doc_id} (distance: {dist:.4f})')

## 💾 Persistence Comparison <a id="persistence"></a>

In [None]:
# Measure on-disk footprint of a store (single file or directory tree)
def dir_size(path):
    """Return the size in bytes of `path`.

    A regular file yields its own size. Anything else is walked with
    `os.walk` and the sizes of all files found beneath it are summed, which
    gives 0 for an empty or non-existent path.
    """
    if os.path.isfile(path):
        return os.path.getsize(path)
    return sum(
        os.path.getsize(os.path.join(root, filename))
        for root, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )

# Measure each store's on-disk size in MB (bytes / 1024 / 1024) and chart it.
# NOTE(review): both stores are still open at this point; whether all data has
# been flushed to disk is not visible here -- confirm before trusting sizes.
storage = {}

synadb_on_disk = os.path.exists(synadb_path)
if synadb_on_disk:
    synadb_bytes = dir_size(synadb_path)
    storage['SynaDB'] = synadb_bytes / 1024 / 1024
    print(f'SynaDB: {storage["SynaDB"]:.1f} MB (single file)')

chroma_on_disk = os.path.exists(chroma_path)
if chroma_on_disk:
    chroma_bytes = dir_size(chroma_path)
    storage['Chroma'] = chroma_bytes / 1024 / 1024
    print(f'Chroma: {storage["Chroma"]:.1f} MB (directory)')

# Only draw the chart when at least one store exists on disk.
if storage:
    memory_comparison(storage, title='Storage Size', ylabel='MB')
    plt.show()

## 📈 Results Summary <a id="results"></a>

In [None]:
# Build and render the results summary as a Markdown table
from IPython.display import display, Markdown


def _fmt_metric(metrics, system, spec):
    """Format `metrics[system]` with `spec`, or return 'N/A' if it is missing.

    Applying a numeric format spec such as ',.0f' directly to the 'N/A'
    placeholder raises ValueError, so the placeholder must bypass formatting.
    """
    value = metrics.get(system)
    return 'N/A' if value is None else format(value, spec)


md = '| Metric | SynaDB | Chroma |\n|--------|--------|--------|\n'
# (row label, metric dict keyed by system name, format spec for the values)
_summary_rows = [
    ('Insert (vec/s)', throughput, ',.0f'),
    ('Search (ms)', latencies, '.2f'),
    ('Recall@10', recall, '.4f'),
    ('Storage (MB)', storage, '.1f'),
]
for label, metrics, spec in _summary_rows:
    # A row is emitted when at least one system has a value for the metric;
    # the other system's cell then shows 'N/A'.
    if metrics:
        md += (
            f'| {label} '
            f'| {_fmt_metric(metrics, "SynaDB", spec)} '
            f'| {_fmt_metric(metrics, "Chroma", spec)} |\n'
        )
display(Markdown(md))

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Render the key takeaways box
# NOTE(review): the points below are fixed strings; none is derived from the
# metrics measured in this run, so re-check them (e.g. the recall claim)
# against the actual results before publishing.
conclusion_box(
    title='Key Takeaways',
    points=[
        '<b>SynaDB</b> uses single-file storage vs Chroma\'s directory structure',
        'Both achieve high recall with HNSW indexing',
        'SynaDB includes experiment tracking, model registry, and tensor engine',
        'For billion-scale, SynaDB supports FAISS as an optional backend',
    ],
    summary='Choose SynaDB for unified AI data layer with zero config. '
            'Choose Chroma for quick LangChain prototyping.'
)

In [None]:
# Remove the temporary benchmark directory (best effort)
import shutil
try:
    shutil.rmtree(temp_dir)
    print(f'✓ Cleaned up {temp_dir}')
except OSError as exc:
    # Cleanup stays best-effort: a failed delete (e.g. files still held open
    # by a store) must not fail the notebook, but it is reported so the
    # leftover directory can be removed by hand. Only OSError is caught, so
    # KeyboardInterrupt and programming errors are no longer swallowed.
    print(f'⚠ Could not remove {temp_dir}: {exc}')
print('\n🎉 Benchmark complete!')