In [None]:
# Cell 1: Header and Setup
import sys
# Put the parent directory first on the import path so the `utils` package
# imported below can be resolved (assumes the notebook is run from a folder
# whose parent contains `utils/` — TODO confirm).
sys.path.insert(0, '..')

# Project-local helpers: notebook display widgets, system info, benchmarking, charts.
from utils.notebook_utils import display_header, display_toc, check_dependency, conclusion_box, info_box
from utils.system_info import display_system_info
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, throughput_comparison, memory_comparison, COLORS

# Pass the notebook title and subtitle to the shared header helper.
display_header('Chunked Storage Comparison', 'SynaDB vs Zarr vs LMDB')

In [None]:
# Cell 2: Table of Contents
# Each entry is (display title, anchor id). The anchor ids must match the
# `<a id="...">` tags on the markdown section headings of this notebook.
sections = [
    ('Introduction', 'introduction'),
    ('Setup', 'setup'),
    ('Benchmark: Chunk Access', 'benchmark-chunk'),
    ('Benchmark: Compression', 'benchmark-compression'),
    ('Benchmark: Concurrent Access', 'benchmark-concurrent'),
    ('Cloud Storage Comparison', 'cloud-storage'),
    ('Append Operations Demo', 'append-operations'),
    ('Results Summary', 'results'),
    ('Conclusions', 'conclusions'),
]
# Hand the list to the shared table-of-contents helper.
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB** against **Zarr** and **LMDB** for chunked and memory-mapped storage:

| System | Type | Key Features |
|--------|------|-------------|
| **SynaDB** | Embedded DB | Single-file, append-only, native compression |
| **Zarr** | Array Store | Chunked N-dimensional arrays, cloud-native |
| **LMDB** | Key-Value Store | Memory-mapped, high read performance |

### What We'll Measure

- **Chunk access performance** (partial tensor loading)
- **Compression ratios** and decompression speeds
- **Concurrent access** patterns
- **Cloud storage** capabilities
- **Append operations** for streaming data

### Test Configuration

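- **Dataset**: Large float32 tensor of shape (1000, 256, 256), about 250 MB (scaled down from a 1 GB target for this demo)
- **Chunk size**: about 25 MB per chunk (100 × 256 × 256 float32 values)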
- **Compression**: LZ4 where supported

In [None]:
# Cell 4: System Info
# Call the shared helper imported from utils.system_info; presumably it reports
# the host hardware/software so the timings below can be put in context
# (its output is not visible from this notebook).
display_system_info()

## 🔧 Setup <a id="setup"></a>

Let's set up our test environment with large tensor data.

In [None]:
# Cell 6: Check Dependencies and Imports
import numpy as np
import time
import os
import shutil
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt

# Probe the three optional storage backends in a fixed order. Each HAS_* flag
# gates the cells further down that use the corresponding package.
HAS_SYNADB, HAS_ZARR, HAS_LMDB = [
    check_dependency(package, f'pip install {package}')
    for package in ('synadb', 'zarr', 'lmdb')
]

# Apply consistent styling
setup_style()

In [None]:
# Cell 7: Generate Large Tensor Data
# Configuration - scaled down for the demo (~250 MiB instead of 1 GB)
TENSOR_SHAPE = (1000, 256, 256)  # 1000 * 256 * 256 float32 values = 250 MiB
CHUNK_SIZE = (100, 256, 256)  # 100 * 256 * 256 float32 values = 25 MiB per chunk
SEED = 42

# The cells below split the tensor along axis 0 with
# TENSOR_SHAPE[0] // CHUNK_SIZE[0] and reshape raw chunk bytes to CHUNK_SIZE.
# A remainder would silently drop trailing rows, and differing trailing
# dimensions would break the reshape, so fail early instead.
assert TENSOR_SHAPE[0] % CHUNK_SIZE[0] == 0, 'TENSOR_SHAPE[0] must be a multiple of CHUNK_SIZE[0]'
assert TENSOR_SHAPE[1:] == CHUNK_SIZE[1:], 'chunks must span the full trailing dimensions'

print(f'Generating tensor with shape {TENSOR_SHAPE}...')
# Keep seeding the legacy global RNG as before: later cells draw from
# np.random directly.
np.random.seed(SEED)

# Generate random tensor data (float32).
# Sampling float32 directly avoids the float64 temporary (twice the final
# size) that np.random.randn(...).astype(np.float32) would allocate.
tensor_data = np.random.default_rng(SEED).standard_normal(TENSOR_SHAPE, dtype=np.float32)

print(f'‚úì Generated tensor shape: {tensor_data.shape}')
print(f'‚úì Data type: {tensor_data.dtype}')
print(f'‚úì Memory usage: {tensor_data.nbytes / 1024 / 1024:.1f} MB')

In [None]:
# Cell 8: Create Temp Directory
# Every benchmark artifact lives under one scratch directory, which the
# cleanup cell at the end of the notebook removes.
temp_dir = tempfile.mkdtemp(prefix='synadb_chunked_')
print(f'Using temp directory: {temp_dir}')

# One store location per backend, all inside the scratch directory.
synadb_path, zarr_path, lmdb_path = (
    os.path.join(temp_dir, store_name)
    for store_name in ('tensor_synadb.db', 'tensor.zarr', 'tensor.lmdb')
)

In [None]:
# Cell 9: Save Data to SynaDB
# Stays None when SynaDB is not installed; later cells test it for truthiness.
synadb_write_time = None

if not HAS_SYNADB:
    print('‚ö†Ô∏è SynaDB not available, skipping...')
else:
    from synadb import SynaDB
    print('Saving data to SynaDB...')
    rows_per_chunk = CHUNK_SIZE[0]
    start = time.perf_counter()
    with SynaDB(synadb_path) as db:
        # Each chunk is a slab of rows along axis 0, written as raw bytes
        # under the key 'tensor/chunk_<i>'.
        num_chunks = TENSOR_SHAPE[0] // rows_per_chunk
        for i in range(num_chunks):
            first_row = i * rows_per_chunk
            slab = tensor_data[first_row:first_row + rows_per_chunk]
            db.put_bytes(f'tensor/chunk_{i}', slab.tobytes())
        # Metadata written alongside the chunks: shape, dtype and chunk count.
        db.put_text('tensor/shape', str(TENSOR_SHAPE))
        db.put_text('tensor/dtype', str(tensor_data.dtype))
        db.put_int('tensor/num_chunks', num_chunks)
    # The timer stops after the `with` block, so it includes closing the database.
    synadb_write_time = time.perf_counter() - start
    print(f'‚úì SynaDB: {num_chunks} chunks in {synadb_write_time:.2f}s')

In [None]:
# Cell 10: Save Data to Zarr
# Stays None when Zarr is not installed; later cells test it for truthiness.
zarr_write_time = None

if not HAS_ZARR:
    print('‚ö†Ô∏è Zarr not available, skipping...')
else:
    import zarr
    print('Saving data to Zarr...')
    start = time.perf_counter()
    # NOTE(review): `zarr.Blosc` and the `compressor=` keyword look like the
    # zarr v2 spelling — confirm against the zarr version this notebook pins.
    lz4_compressor = zarr.Blosc(cname='lz4', clevel=5)
    # Create the chunked, compressed array on disk, then write the whole
    # tensor in a single assignment.
    z = zarr.open(
        zarr_path,
        mode='w',
        shape=TENSOR_SHAPE,
        chunks=CHUNK_SIZE,
        dtype='float32',
        compressor=lz4_compressor,
    )
    z[:] = tensor_data
    zarr_write_time = time.perf_counter() - start
    print(f'‚úì Zarr: Written in {zarr_write_time:.2f}s')
    print(f'  Chunks: {z.chunks}')
    print(f'  Compressor: {z.compressor}')

In [None]:
# Cell 11: Save Data to LMDB
# Stays None when LMDB is not installed; later cells test it for truthiness.
lmdb_write_time = None

if HAS_LMDB:
    import lmdb
    print('Saving data to LMDB...')
    start = time.perf_counter()
    # LMDB requires pre-allocated map size
    map_size = tensor_data.nbytes * 2  # 2x for safety
    env = lmdb.open(lmdb_path, map_size=map_size)
    try:
        # One write transaction covers every chunk plus the metadata keys.
        with env.begin(write=True) as txn:
            num_chunks = TENSOR_SHAPE[0] // CHUNK_SIZE[0]
            for i in range(num_chunks):
                chunk = tensor_data[i*CHUNK_SIZE[0]:(i+1)*CHUNK_SIZE[0]]
                txn.put(f'chunk_{i}'.encode(), chunk.tobytes())
            txn.put(b'metadata/shape', str(TENSOR_SHAPE).encode())
            txn.put(b'metadata/dtype', str(tensor_data.dtype).encode())
    finally:
        # Close even when a put fails (e.g. the map is full). A leaked handle
        # would leave this environment open in the kernel, and py-lmdb treats
        # opening the same environment twice in one process as an error.
        env.close()
    # As before, the measured time includes closing the environment.
    lmdb_write_time = time.perf_counter() - start
    print(f'‚úì LMDB: {num_chunks} chunks in {lmdb_write_time:.2f}s')
else:
    print('‚ö†Ô∏è LMDB not available, skipping...')

In [None]:
# Cell 12: Write Time Comparison
# Keep only the backends that produced a timing (None or 0 means "not run").
write_times = {
    label: seconds
    for label, seconds in (
        ('SynaDB', synadb_write_time),
        ('Zarr', zarr_write_time),
        ('LMDB', lmdb_write_time),
    )
    if seconds
}

# Skip the chart entirely when no backend was available.
if write_times:
    fig = bar_comparison(
        write_times,
        title='Write Time (Large Tensor)',
        ylabel='Time (seconds)',
        lower_is_better=True,
    )
    plt.show()

## 📦 Benchmark: Chunk Access <a id="benchmark-chunk"></a>

Let's measure partial tensor loading performance - reading individual chunks.

In [None]:
# Cell 14: Chunk Access Setup
NUM_CHUNK_READS = 50
num_chunks = TENSOR_SHAPE[0] // CHUNK_SIZE[0]

# Re-seed the global RNG so the same sequence of chunk indices is drawn on
# every run; all three backends are then benchmarked on that same sequence.
np.random.seed(SEED)
random_chunk_indices = np.random.randint(0, num_chunks, size=NUM_CHUNK_READS)
print(f'Testing {NUM_CHUNK_READS} random chunk reads...')

In [None]:
# Cell 15: SynaDB Chunk Access
# Per-read latencies in milliseconds; stays empty when SynaDB is unavailable.
synadb_chunk_times = []

if HAS_SYNADB:
    from synadb import SynaDB
    print('Benchmarking SynaDB chunk access...')
    with SynaDB(synadb_path) as db:
        # Warm up
        for _ in range(5):
            db.get_bytes('tensor/chunk_0')
        # Benchmark: each sample covers the fetch plus rebuilding the
        # float32 array view with shape CHUNK_SIZE.
        for idx in random_chunk_indices:
            t0 = time.perf_counter()
            raw = db.get_bytes(f'tensor/chunk_{idx}')
            chunk = np.frombuffer(raw, dtype=np.float32).reshape(CHUNK_SIZE)
            synadb_chunk_times.append((time.perf_counter() - t0) * 1000)
    mean_ms = np.mean(synadb_chunk_times)
    p95_ms = np.percentile(synadb_chunk_times, 95)
    print(f'‚úì SynaDB: Mean chunk read {mean_ms:.2f}ms')
    print(f'  P95: {p95_ms:.2f}ms')

In [None]:
# Cell 16: Zarr Chunk Access
# Per-read latencies in milliseconds; stays empty when Zarr is unavailable.
zarr_chunk_times = []

if HAS_ZARR:
    import zarr
    print('Benchmarking Zarr chunk access...')
    z = zarr.open(zarr_path, mode='r')
    rows_per_chunk = CHUNK_SIZE[0]
    # Warm up
    for _ in range(5):
        z[0:rows_per_chunk]
    # Benchmark: slice exactly one chunk's worth of rows per read; the slice
    # bounds are computed outside the timed region.
    for idx in random_chunk_indices:
        first_row = idx * rows_per_chunk
        last_row = first_row + rows_per_chunk
        t0 = time.perf_counter()
        chunk = z[first_row:last_row]
        zarr_chunk_times.append((time.perf_counter() - t0) * 1000)
    mean_ms = np.mean(zarr_chunk_times)
    p95_ms = np.percentile(zarr_chunk_times, 95)
    print(f'‚úì Zarr: Mean chunk read {mean_ms:.2f}ms')
    print(f'  P95: {p95_ms:.2f}ms')

In [None]:
# Cell 17: LMDB Chunk Access
# Per-read latencies in milliseconds; stays empty when LMDB is unavailable.
lmdb_chunk_times = []

if HAS_LMDB:
    import lmdb
    print('Benchmarking LMDB chunk access...')
    env = lmdb.open(lmdb_path, readonly=True)
    try:
        # All reads share one read transaction.
        with env.begin() as txn:
            # Warm up
            for _ in range(5): _ = txn.get(b'chunk_0')
            # Benchmark: each sample covers the fetch plus rebuilding the
            # float32 array view with shape CHUNK_SIZE.
            for idx in random_chunk_indices:
                start = time.perf_counter()
                chunk_bytes = txn.get(f'chunk_{idx}'.encode())
                chunk = np.frombuffer(chunk_bytes, dtype=np.float32).reshape(CHUNK_SIZE)
                elapsed = (time.perf_counter() - start) * 1000
                lmdb_chunk_times.append(elapsed)
    finally:
        # Close even if a read or reshape fails, so the environment is not
        # left open in the kernel for the cells that reopen it later.
        env.close()
    print(f'‚úì LMDB: Mean chunk read {np.mean(lmdb_chunk_times):.2f}ms')
    print(f'  P95: {np.percentile(lmdb_chunk_times, 95):.2f}ms')

In [None]:
# Cell 18: Chunk Access Results
# Mean latency (ms) per backend; backends with no samples are left out.
chunk_latencies = {
    label: np.mean(samples_ms)
    for label, samples_ms in (
        ('SynaDB', synadb_chunk_times),
        ('Zarr', zarr_chunk_times),
        ('LMDB', lmdb_chunk_times),
    )
    if samples_ms
}

if chunk_latencies:
    fig = bar_comparison(
        chunk_latencies,
        title='Chunk Read Latency',
        ylabel='Latency (ms)',
        lower_is_better=True,
    )
    plt.show()

## 🗜️ Benchmark: Compression <a id="benchmark-compression"></a>

Let's compare compression ratios and decompression speeds.

In [None]:
# Cell 20: Compression Ratio Comparison
def get_size(path):
    """Return the on-disk size of `path` in bytes.

    A regular file reports its own size. Anything else is walked as a
    directory tree and the sizes of all files found are summed, so a path
    that does not exist yields 0.
    """
    if os.path.isfile(path):
        return os.path.getsize(path)
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(path)
        for name in names
    )

# Baseline for every ratio: the in-memory size of the uncompressed tensor.
original_size = tensor_data.nbytes
print(f'Original tensor size: {original_size / 1024 / 1024:.1f} MB\n')

compression_ratios = {}
storage_sizes = {}

# Report each store that exists on disk, in a fixed backend order.
# storage_sizes holds MB on disk; compression_ratios holds original/on-disk.
for label, store_path in (
    ('SynaDB', synadb_path),
    ('Zarr', zarr_path),
    ('LMDB', lmdb_path),
):
    if not os.path.exists(store_path):
        continue
    size = get_size(store_path)
    storage_sizes[label] = size / (1024 * 1024)
    compression_ratios[label] = original_size / size
    print(f'{label}: {size / 1024 / 1024:.1f} MB (ratio: {original_size / size:.2f}x)')

In [None]:
# Cell 21: Storage Size Visualization
# Nothing to draw when no store was written.
if storage_sizes:
    fig = memory_comparison(
        storage_sizes,
        title='Storage Size Comparison',
        ylabel='Size (MB)',
    )
    plt.show()

In [None]:
# Cell 22: Compression Ratio Visualization
# Nothing to draw when no store was written.
if compression_ratios:
    fig = bar_comparison(
        compression_ratios,
        title='Compression Ratio (Higher is Better)',
        ylabel='Compression Ratio',
        lower_is_better=False,
    )
    plt.show()

## 🔀 Benchmark: Concurrent Access <a id="benchmark-concurrent"></a>

Let's test multi-threaded data loading patterns (SynaDB and LMDB only; Zarr is not included in this test).

In [None]:
# Cell 24: Concurrent Access Demo
import threading
import queue

# Number of reader threads, and how many chunk reads each one performs.
NUM_THREADS, READS_PER_THREAD = 4, 10

print(f'Testing concurrent access with {NUM_THREADS} threads, {READS_PER_THREAD} reads each...')

In [None]:
# Cell 25: SynaDB Concurrent Access
# Stays None when SynaDB is not installed; later cells test it for truthiness.
synadb_concurrent_time = None

if HAS_SYNADB:
    from synadb import SynaDB
    results_queue = queue.Queue()

    def synadb_reader(thread_id, db_path, indices):
        """Read the given chunks through a private SynaDB handle.

        thread_id: worker number; not used by the body.
        db_path:   database file that this worker opens for itself.
        indices:   chunk numbers, wrapped modulo the notebook-level num_chunks.

        The per-read durations (seconds) are pushed onto results_queue as one list.
        """
        times = []
        with SynaDB(db_path) as db:
            for idx in indices:
                t0 = time.perf_counter()
                db.get_bytes(f'tensor/chunk_{idx % num_chunks}')
                times.append(time.perf_counter() - t0)
        results_queue.put(times)

    print('Benchmarking SynaDB concurrent access...')
    threads = []
    # Wall-clock time from spawning the first worker until the last one is joined.
    start = time.perf_counter()
    for i in range(NUM_THREADS):
        first = i * READS_PER_THREAD
        worker = threading.Thread(
            target=synadb_reader,
            args=(i, synadb_path, list(range(first, first + READS_PER_THREAD))),
        )
        threads.append(worker)
        worker.start()
    for worker in threads:
        worker.join()
    synadb_concurrent_time = time.perf_counter() - start
    print(f'‚úì SynaDB: {NUM_THREADS * READS_PER_THREAD} reads in {synadb_concurrent_time:.3f}s')

In [None]:
# Cell 26: LMDB Concurrent Access
# Stays None when LMDB is not installed; later cells test it for truthiness.
lmdb_concurrent_time = None

if HAS_LMDB:
    import lmdb

    def lmdb_reader(thread_id, db_path, indices, env=None):
        """Read the given chunks inside a single read transaction.

        thread_id: worker number; not used by the body.
        db_path:   LMDB directory; only opened here when `env` is not supplied.
        indices:   chunk numbers, wrapped modulo the notebook-level num_chunks.
        env:       optional already-open lmdb.Environment to read from. When
                   given it is used as-is and left open for the caller to close.
        """
        owns_env = env is None
        if owns_env:
            env = lmdb.open(db_path, readonly=True)
        try:
            with env.begin() as txn:
                for idx in indices:
                    _ = txn.get(f'chunk_{idx % num_chunks}'.encode())
        finally:
            if owns_env:
                env.close()

    print('Benchmarking LMDB concurrent access...')
    threads = []
    start = time.perf_counter()
    # Open the environment ONCE and share it between the workers. py-lmdb
    # documents opening the same environment more than once in a process as
    # a serious error, which is what one lmdb.open() per thread did.
    # The timed region now covers one open/close instead of one per thread.
    shared_env = lmdb.open(lmdb_path, readonly=True)
    try:
        for i in range(NUM_THREADS):
            indices = list(range(i * READS_PER_THREAD, (i + 1) * READS_PER_THREAD))
            t = threading.Thread(
                target=lmdb_reader,
                args=(i, lmdb_path, indices),
                kwargs={'env': shared_env},
            )
            t.start()
            # Record only threads that actually started, so join() below is valid.
            threads.append(t)
    finally:
        # Wait for every worker before closing the environment they read from.
        for t in threads:
            t.join()
        shared_env.close()
    lmdb_concurrent_time = time.perf_counter() - start
    print(f'‚úì LMDB: {NUM_THREADS * READS_PER_THREAD} reads in {lmdb_concurrent_time:.3f}s')

In [None]:
# Cell 27: Concurrent Access Results
# Only SynaDB and LMDB are measured here; unmeasured backends are left out.
concurrent_times = {
    label: seconds
    for label, seconds in (
        ('SynaDB', synadb_concurrent_time),
        ('LMDB', lmdb_concurrent_time),
    )
    if seconds
}

if concurrent_times:
    fig = bar_comparison(
        concurrent_times,
        title=f'Concurrent Access ({NUM_THREADS} threads)',
        ylabel='Time (seconds)',
        lower_is_better=True,
    )
    plt.show()

## ☁️ Cloud Storage Comparison <a id="cloud-storage"></a>

Zarr is designed for cloud-native storage. Let's compare the cloud storage capabilities.

In [None]:
# Cell 29: Cloud Storage Capabilities
# Static, hand-written comparison: one (system, bullet points) pair per
# backend, printed between two horizontal rules.
rule = '=' * 70
capability_notes = (
    ('SynaDB', (
        'Single file storage (easy to sync)',
        'Can be stored on any cloud storage (S3, GCS, Azure Blob)',
        'Requires full file download for access',
        'Best for: Local-first workflows with cloud backup',
    )),
    ('Zarr', (
        'Native cloud storage support (S3, GCS, Azure)',
        'Chunk-level access (only download needed chunks)',
        'Parallel chunk downloads',
        'Best for: Large datasets accessed from cloud',
    )),
    ('LMDB', (
        'Memory-mapped files (local only)',
        'No native cloud support',
        'Requires full database download',
        'Best for: High-performance local access',
    )),
)

print('Cloud Storage Capabilities Comparison\n')
print(rule)

for system, notes in capability_notes:
    print(f'\nüì¶ {system}:')
    for note in notes:
        print(f'  - {note}')

print('\n' + rule)

In [None]:
# Cell 30: Zarr Cloud Storage Demo (Simulated)
# Lists store options as plain text, then writes a small subset of the tensor
# into a single-file ZipStore to show Zarr's portable packaging.
if HAS_ZARR:
    import zarr
    print('Zarr Cloud Storage Features:\n')

    # Show Zarr's store options
    print('Available Zarr stores:')
    print('  - DirectoryStore (local filesystem)')
    print('  - ZipStore (single zip file)')
    print('  - S3Store (Amazon S3) - requires s3fs')
    print('  - GCSStore (Google Cloud Storage) - requires gcsfs')
    print('  - ABSStore (Azure Blob Storage) - requires adlfs')

    # Demo with ZipStore (portable single file)
    zip_path = os.path.join(temp_dir, 'tensor.zarr.zip')
    print(f'\nCreating ZipStore demo at {zip_path}...')

    # Derive the array shape from the data actually written, so the demo keeps
    # working if the tensor dimensions configured earlier are changed.
    subset = tensor_data[:100]  # Store subset
    # NOTE(review): `zarr.ZipStore` is the zarr v2 location of this class —
    # confirm against the zarr version this notebook pins.
    store = zarr.ZipStore(zip_path, mode='w')
    try:
        z = zarr.open(store, mode='w', shape=subset.shape, chunks=(10,) + subset.shape[1:], dtype='float32')
        z[:] = subset
    finally:
        # Close even when the write fails, so the zip file handle is released
        # before the cleanup cell removes the temp directory.
        store.close()

    zip_size = os.path.getsize(zip_path) / (1024 * 1024)
    print(f'‚úì ZipStore created: {zip_size:.1f} MB')
    print('  ‚Üí Single portable file like SynaDB!')
else:
    print('‚ö†Ô∏è Zarr not available, skipping cloud storage demo...')

## ➕ Append Operations Demo <a id="append-operations"></a>

Let's compare how each system handles appending new data - critical for streaming ML workloads.

In [None]:
# Cell 32: Append Operations Setup
# Shape of each appended block, and how many blocks each backend appends.
APPEND_CHUNK_SIZE = (10, 256, 256)
NUM_APPENDS = 100
print(f'Testing {NUM_APPENDS} append operations with chunk size {APPEND_CHUNK_SIZE}...')

In [None]:
# Cell 33: SynaDB Append Operations
# Stays None when SynaDB is not installed; later cells test it for truthiness.
synadb_append_time = None

if HAS_SYNADB:
    from synadb import SynaDB
    append_db_path = os.path.join(temp_dir, 'append_test.db')
    print('Benchmarking SynaDB append operations...')

    # NOTE(review): the random block is generated inside the timed loop, so
    # the measured time is generation + write, not write alone.
    start = time.perf_counter()
    with SynaDB(append_db_path) as db:
        for i in range(NUM_APPENDS):
            payload = np.random.randn(*APPEND_CHUNK_SIZE).astype(np.float32).tobytes()
            db.put_bytes(f'stream/chunk_{i}', payload)
    synadb_append_time = time.perf_counter() - start

    appends_per_sec = NUM_APPENDS / synadb_append_time
    print(f'‚úì SynaDB: {NUM_APPENDS} appends in {synadb_append_time:.3f}s')
    print(f'  Throughput: {appends_per_sec:.0f} appends/sec')

In [None]:
# Cell 34: Zarr Append Operations
# Stays None when Zarr is not installed; later cells test it for truthiness.
zarr_append_time = None

if HAS_ZARR:
    import zarr
    append_zarr_path = os.path.join(temp_dir, 'append_test.zarr')
    print('Benchmarking Zarr append operations...')

    # The array starts with zero rows and grows along axis 0 on each append;
    # creating it is part of the timed region.
    # NOTE(review): the random block is generated inside the timed loop, so
    # the measured time is generation + write, not write alone.
    start = time.perf_counter()
    z = zarr.open(
        append_zarr_path,
        mode='w',
        shape=(0, 256, 256),
        chunks=(10, 256, 256),
        dtype='float32',
    )
    for i in range(NUM_APPENDS):
        block = np.random.randn(*APPEND_CHUNK_SIZE).astype(np.float32)
        z.append(block, axis=0)
    zarr_append_time = time.perf_counter() - start

    appends_per_sec = NUM_APPENDS / zarr_append_time
    print(f'‚úì Zarr: {NUM_APPENDS} appends in {zarr_append_time:.3f}s')
    print(f'  Throughput: {appends_per_sec:.0f} appends/sec')
    print(f'  Final shape: {z.shape}')

In [None]:
# Cell 35: LMDB Append Operations
# Stays None when LMDB is not installed; later cells test it for truthiness.
lmdb_append_time = None

if HAS_LMDB:
    import lmdb
    append_lmdb_path = os.path.join(temp_dir, 'append_test.lmdb')
    print('Benchmarking LMDB append operations...')

    # LMDB requires pre-allocated map size.
    # Compute it with Python ints: np.prod returns a fixed-width NumPy integer
    # (platform dependent), so multiplying it directly could overflow for
    # larger configurations.
    chunk_nbytes = int(np.prod(APPEND_CHUNK_SIZE, dtype=np.int64)) * np.dtype(np.float32).itemsize
    map_size = NUM_APPENDS * chunk_nbytes * 2  # 2x safety margin
    env = lmdb.open(append_lmdb_path, map_size=map_size)

    try:
        # One write transaction per append. The random block is generated
        # inside the timed loop, as in the SynaDB and Zarr append cells.
        start = time.perf_counter()
        for i in range(NUM_APPENDS):
            chunk = np.random.randn(*APPEND_CHUNK_SIZE).astype(np.float32)
            with env.begin(write=True) as txn:
                txn.put(f'chunk_{i}'.encode(), chunk.tobytes())
        # As before, the timer stops before the environment is closed.
        lmdb_append_time = time.perf_counter() - start
    finally:
        # Close even when a put fails (e.g. the map is full), so the
        # environment is not left open in the kernel.
        env.close()

    print(f'‚úì LMDB: {NUM_APPENDS} appends in {lmdb_append_time:.3f}s')
    print(f'  Throughput: {NUM_APPENDS / lmdb_append_time:.0f} appends/sec')

In [None]:
# Cell 36: Append Operations Results
# Appends per second for each backend that produced a timing.
append_throughput = {
    label: NUM_APPENDS / seconds
    for label, seconds in (
        ('SynaDB', synadb_append_time),
        ('Zarr', zarr_append_time),
        ('LMDB', lmdb_append_time),
    )
    if seconds
}

if append_throughput:
    fig = throughput_comparison(
        append_throughput,
        title='Append Throughput',
        ylabel='Appends/second',
    )
    plt.show()

## 📊 Results Summary <a id="results"></a>

Let's summarize all benchmark results.

In [None]:
# Cell 38: Results Summary Table
from IPython.display import display, Markdown

# Build the summary as a markdown table: one row per metric, one column per
# backend in the order SynaDB, Zarr, LMDB. Any measurement that is missing
# (backend not installed or cell not run) is shown as 'N/A'.
backends = ('SynaDB', 'Zarr', 'LMDB')

# Write time (seconds)
write_cells = [
    f'{seconds:.2f}s' if seconds else 'N/A'
    for seconds in (synadb_write_time, zarr_write_time, lmdb_write_time)
]

# Chunk access (mean latency in ms)
latency_cells = [
    f'{np.mean(samples_ms):.2f}ms' if samples_ms else 'N/A'
    for samples_ms in (synadb_chunk_times, zarr_chunk_times, lmdb_chunk_times)
]

# Compression ratio
ratio_cells = [
    f'{compression_ratios[name]:.2f}x' if name in compression_ratios else 'N/A'
    for name in backends
]

# Concurrent access (Zarr is not measured, so its slot is always empty)
concurrent_cells = [
    f'{seconds:.3f}s' if seconds else 'N/A'
    for seconds in (synadb_concurrent_time, None, lmdb_concurrent_time)
]

# Append throughput (appends per second)
append_cells = [
    f'{NUM_APPENDS/seconds:.0f}/s' if seconds else 'N/A'
    for seconds in (synadb_append_time, zarr_append_time, lmdb_append_time)
]

table_rows = [
    ('Write Time', write_cells),
    ('Chunk Read Latency', latency_cells),
    ('Compression Ratio', ratio_cells),
    ('Concurrent Access', concurrent_cells),
    ('Append Throughput', append_cells),
    # Features (static, hand-written)
    ('Cloud Native', ['‚óã Sync', '‚úì Yes', '‚úó No']),
    ('Single File', ['‚úì Yes', '‚óã Optional', '‚úó No']),
    ('Compression', ['‚úì LZ4', '‚úì Multiple', '‚úó No']),
]

summary_md = '''\n| Metric | SynaDB | Zarr | LMDB |\n|--------|--------|------|------|\n'''
for metric, (synadb_cell, zarr_cell, lmdb_cell) in table_rows:
    summary_md += f'| {metric} | {synadb_cell} | {zarr_cell} | {lmdb_cell} |\n'

display(Markdown(summary_md))

In [None]:
# Cell 39: Final Storage Comparison
# On-disk size in MB of each main store that exists, in a fixed backend order.
final_sizes = {
    label: get_size(store_path) / (1024 * 1024)
    for label, store_path in (
        ('SynaDB', synadb_path),
        ('Zarr', zarr_path),
        ('LMDB', lmdb_path),
    )
    if os.path.exists(store_path)
}

if final_sizes:
    fig = memory_comparison(
        final_sizes,
        title='Final Storage Size',
        ylabel='Size (MB)',
    )
    plt.show()
    print('\nStorage sizes:')
    for label, size_mb in final_sizes.items():
        print(f'  {label}: {size_mb:.1f} MB')

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Cell 41: Conclusions
# Hand-written takeaways passed to the shared conclusion helper.
key_points = [
    'Zarr excels at cloud-native storage with chunk-level access',
    'LMDB provides fastest local read performance via memory mapping',
    'SynaDB offers single-file simplicity with good compression',
    "For streaming/append workloads, SynaDB's append-only design is ideal",
    'Zarr is best for large scientific datasets accessed from cloud',
    'LMDB is best for high-performance local key-value access',
]
closing_summary = (
    'Choose based on deployment: Zarr for cloud, LMDB for local performance, '
    'SynaDB for unified ML workflows with single-file portability.'
)

conclusion_box(title='Key Takeaways', points=key_points, summary=closing_summary)

In [None]:
# Cell 42: Cleanup
import shutil
# Best-effort removal of the scratch directory that holds every store created
# above. A failure is reported but does not stop the notebook.
try:
    shutil.rmtree(temp_dir)
except Exception as e:
    print(f'‚ö†Ô∏è Could not clean up: {e}')
else:
    print(f'‚úì Cleaned up temp directory: {temp_dir}')