In [None]:
# Cell 1: Header and Setup
# Extends the import search path, pulls in the shared notebook helpers, and
# renders the title banner.
import sys
# Prepended before the `utils` imports below run. The entry is relative, so it
# is resolved against the kernel's current working directory.
# NOTE(review): presumably the notebook is launched from its own folder so that
# '..' is the directory containing `utils/` - confirm.
sys.path.insert(0, '..')

from utils.notebook_utils import display_header, display_toc, check_dependency, conclusion_box
from utils.system_info import display_system_info
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, throughput_comparison, COLORS

# Two positional strings; presumably title and subtitle - confirm against utils.notebook_utils.
display_header('Model Registry Comparison', 'SynaDB vs MLflow Model Registry vs DVC')

In [None]:
# Cell 2: Table of Contents
# Heading text -> anchor id. Each anchor matches the `<a id="...">` tag on the
# corresponding Markdown section heading further down the notebook.
# The mapping is flattened to (title, anchor) pairs in insertion order.
sections = list({
    'Introduction': 'introduction',
    'Setup': 'setup',
    'Benchmark: Model Save/Load': 'benchmark-save-load',
    'Demo: Version Management': 'demo-version',
    'Demo: Stage Promotion': 'demo-stage',
    'Demo: Rollback': 'demo-rollback',
    'Integrity Guarantees': 'integrity',
    'Results Summary': 'results',
    'Conclusions': 'conclusions',
}.items())
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB's ModelRegistry** against **MLflow Model Registry** and **DVC**.

| System | Type | Key Features |
|--------|------|-------------|
| **SynaDB** | Embedded | Single-file, SHA-256 checksums, stage management |
| **MLflow** | Server-based | Industry standard, rich UI, model serving |
| **DVC** | Git-based | Version control for data/models, remote storage |

### What We'll Measure

- **Model save/load** latency and throughput
- **Version management** capabilities
- **Stage promotion** workflows
- **Rollback** operations
- **Integrity guarantees** (checksum verification)

In [None]:
# Cell 4: System Info
# Calls the shared helper imported from utils.system_info in the setup cell.
# NOTE(review): presumably prints host hardware/software details so the
# benchmark numbers can be read in context - confirm against the helper.
display_system_info()

## 🔧 Setup <a id="setup"></a>

Let's set up our test environment for model registry comparison.

In [None]:
# Cell 6: Check Dependencies and Imports
import numpy as np
import time
import os
import shutil
import tempfile
import hashlib
from pathlib import Path
import matplotlib.pyplot as plt

# Probe the three registries under comparison, in the order SynaDB, MLflow, DVC.
# Each flag holds whatever check_dependency returns for that package; later
# cells branch on HAS_SYNADB and HAS_MLFLOW.
# DVC is optional here: the notebook only simulates its file operations.
HAS_SYNADB, HAS_MLFLOW, HAS_DVC = (
    check_dependency(package, f'pip install {package}')
    for package in ('synadb', 'mlflow', 'dvc')
)

# Apply the shared chart styling once, before any figure is drawn
setup_style()

In [None]:
# Cell 7: Configuration
# Benchmark knobs, kept together so they can be tuned in one place.
SEED = 42                     # Seed for NumPy's global random generator
NUM_VERSIONS = 10             # How many versions are saved for each model
MODEL_SIZES_MB = [1, 10, 50]  # Payload sizes (in MB) to benchmark

# Seed the legacy global generator: the data-generation cell draws from it
# through np.random.bytes, so this makes the generated payloads repeatable.
np.random.seed(SEED)

# Echo the settings so they are captured alongside the results
print(
    'Test Configuration:',
    f'  Model sizes: {MODEL_SIZES_MB} MB',
    f'  Versions per model: {NUM_VERSIONS}',
    sep='\n',
)

In [None]:
# Cell 8: Create Temp Directory
# All benchmark output is written under one scratch directory so the cleanup
# cell at the end of the notebook can remove it in a single call.
temp_dir = tempfile.mkdtemp(prefix='synadb_model_benchmark_')
print(f'Using temp directory: {temp_dir}')

# One storage location per system, each rooted in the scratch directory
synadb_path, mlflow_path, dvc_path = (
    os.path.join(temp_dir, leaf)
    for leaf in ('synadb_models.db', 'mlruns', 'dvc_models')
)

In [None]:
# Cell 9: Generate Test Model Data
# Random byte strings stand in for serialized model weights: one payload per
# configured size, keyed by that size in MB.
model_data = {}
for size_mb in MODEL_SIZES_MB:
    payload_bytes = size_mb * 1024 * 1024
    model_data[size_mb] = np.random.bytes(payload_bytes)
    print(f'✓ Generated {size_mb}MB model data')

# Synthetic training metadata, one dict per version index. Accuracy and epochs
# rise with the index while loss and learning rate fall.
version_metadata = []
for i in range(NUM_VERSIONS):
    version_metadata.append({
        'accuracy': 0.85 + i * 0.01,
        'loss': 0.5 - i * 0.03,
        'epochs': 10 + i * 5,
        'learning_rate': 0.001 * (0.9 ** i),
    })
print(f'✓ Generated metadata for {NUM_VERSIONS} versions')

## ⚡ Benchmark: Model Save/Load <a id="benchmark-save-load"></a>

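Let's measure how fast each system can save model versions, and how fast SynaDB can load them back. (Load benchmarks for MLflow and DVC are not implemented in this notebook, and the DVC numbers come from simulated file operations rather than the DVC tool itself.)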

In [None]:
# Cell 11: SynaDB Model Save Benchmark
# Per-size list of save latencies in milliseconds. It stays empty when SynaDB
# is missing so the charting and summary cells can still run.
synadb_save_times = {size: [] for size in MODEL_SIZES_MB}
synadb_registry = None

if not HAS_SYNADB:
    print('⚠️ SynaDB not available, skipping...')
else:
    from synadb import ModelRegistry

    print('Benchmarking SynaDB model save...')
    synadb_registry = ModelRegistry(synadb_path)

    for size_mb in MODEL_SIZES_MB:
        print(f'\n  Testing {size_mb}MB model...')
        save_times_ms = synadb_save_times[size_mb]

        for version_idx in range(NUM_VERSIONS):
            # Timed region: the save_model call together with building its
            # arguments (metadata values are passed as strings).
            t0 = time.perf_counter()
            version = synadb_registry.save_model(
                f'model_{size_mb}mb',
                model_data[size_mb],
                {key: str(value) for key, value in version_metadata[version_idx].items()}
            )
            save_times_ms.append((time.perf_counter() - t0) * 1000)  # ms

        # MB written divided by seconds spent (times are in ms, hence * 1000)
        throughput = size_mb * NUM_VERSIONS * 1000 / sum(save_times_ms)
        print(f'    Mean: {np.mean(save_times_ms):.2f}ms')
        print(f'    Throughput: {throughput:.1f} MB/s')

In [None]:
# Cell 12: MLflow Model Save Benchmark
# Times how long MLflow takes to record one "version" of each model: a tracking
# run that logs the model file as an artifact and the version's metadata as
# params. Note that this does not register a model version in the MLflow Model
# Registry; only start_run / log_artifact / log_param are exercised.
#
# Per-size list of save latencies in milliseconds. It stays empty when MLflow
# is missing so the charting and summary cells can still run.
mlflow_save_times = {size: [] for size in MODEL_SIZES_MB}

if HAS_MLFLOW:
    from pathlib import Path

    import mlflow
    from mlflow.tracking import MlflowClient

    print('Benchmarking MLflow model save...')
    # Build the URI with pathlib rather than 'file://' + path. Plain string
    # concatenation yields an invalid URI on Windows (drive letter, backslashes)
    # and for paths containing characters that need percent-escaping.
    # as_uri() requires an absolute path, which tempfile.mkdtemp provides.
    mlflow.set_tracking_uri(Path(mlflow_path).as_uri())
    client = MlflowClient()

    for size_mb in MODEL_SIZES_MB:
        print(f'\n  Testing {size_mb}MB model...')
        model_name = f'model_{size_mb}mb'

        # log_artifact takes a file path, so write the payload to disk once,
        # outside the timed region.
        model_file = os.path.join(temp_dir, f'temp_model_{size_mb}mb.bin')
        with open(model_file, 'wb') as f:
            f.write(model_data[size_mb])

        for v in range(NUM_VERSIONS):
            # Timed region: run creation, artifact logging and param logging
            start = time.perf_counter()
            with mlflow.start_run():
                mlflow.log_artifact(model_file, 'model')
                for k, val in version_metadata[v].items():
                    mlflow.log_param(k, val)
            elapsed = (time.perf_counter() - start) * 1000  # ms
            mlflow_save_times[size_mb].append(elapsed)

        # MB logged divided by seconds spent (times are in ms, hence * 1000)
        throughput = size_mb * NUM_VERSIONS * 1000 / sum(mlflow_save_times[size_mb])
        print(f'    Mean: {np.mean(mlflow_save_times[size_mb]):.2f}ms')
        print(f'    Throughput: {throughput:.1f} MB/s')
else:
    print('⚠️ MLflow not available, skipping...')

In [None]:
# Cell 13: DVC Model Save Benchmark (Simulated)
# The DVC tool itself is never invoked. This cell times a plain file write plus
# an MD5 digest of the payload as a stand-in for DVC's core save work, and it
# runs regardless of whether the dvc package is installed.
dvc_save_times = {size: [] for size in MODEL_SIZES_MB}

print('Benchmarking DVC-style model save (file operations only)...')

os.makedirs(dvc_path, exist_ok=True)

for size_mb in MODEL_SIZES_MB:
    print(f'\n  Testing {size_mb}MB model...')
    dvc_times_ms = dvc_save_times[size_mb]

    for version_idx in range(NUM_VERSIONS):
        # Timed region: path construction, file write and hashing
        t0 = time.perf_counter()

        # Every version gets its own file on disk
        model_file = os.path.join(dvc_path, f'model_{size_mb}mb_v{version_idx}.bin')
        with open(model_file, 'wb') as out:
            out.write(model_data[size_mb])

        # The digest is computed only for its cost; the value is not used
        md5_hash = hashlib.md5(model_data[size_mb]).hexdigest()

        dvc_times_ms.append((time.perf_counter() - t0) * 1000)  # ms

    # MB written divided by seconds spent (times are in ms, hence * 1000)
    throughput = size_mb * NUM_VERSIONS * 1000 / sum(dvc_times_ms)
    print(f'    Mean: {np.mean(dvc_times_ms):.2f}ms')
    print(f'    Throughput: {throughput:.1f} MB/s')

In [None]:
# Cell 14: Model Save Results Visualization
# Charts save throughput for a single representative model size.
test_size = 10  # Model size (MB) used for the comparison

# Label -> MB/s. A system is included only if it recorded timings for test_size.
# .get() keeps the cell from raising KeyError when test_size is not one of the
# configured MODEL_SIZES_MB.
save_throughput = {}
for label, times_by_size in (
    ('SynaDB', synadb_save_times),
    ('MLflow', mlflow_save_times),
    ('DVC (file ops)', dvc_save_times),
):
    times_ms = times_by_size.get(test_size)
    if times_ms:
        # Use the number of samples actually recorded, so the figure stays
        # correct even if NUM_VERSIONS was changed after the benchmark ran.
        # Times are in ms, hence * 1000.
        save_throughput[label] = test_size * len(times_ms) * 1000 / sum(times_ms)

if save_throughput:
    fig = throughput_comparison(
        save_throughput,
        title=f'Model Save Throughput ({test_size}MB models)',
        ylabel='MB/second'
    )
    plt.show()
else:
    print('No save results to display.')

In [None]:
# Cell 15: SynaDB Model Load Benchmark
# Per-size list of load latencies in milliseconds. It stays empty when SynaDB
# is missing or no registry was opened by the save benchmark.
synadb_load_times = {size: [] for size in MODEL_SIZES_MB}

if not (HAS_SYNADB and synadb_registry):
    print('⚠️ SynaDB not available, skipping...')
else:
    print('Benchmarking SynaDB model load...')

    for size_mb in MODEL_SIZES_MB:
        print(f'\n  Testing {size_mb}MB model...')
        load_times_ms = synadb_load_times[size_mb]

        # NOTE(review): assumes the registry numbered the saved versions
        # 1..NUM_VERSIONS - confirm against the SynaDB API.
        for version_number in range(1, NUM_VERSIONS + 1):
            # Timed region: load_model only (the original comment states this
            # includes checksum verification)
            t0 = time.perf_counter()
            data, info = synadb_registry.load_model(f'model_{size_mb}mb', version=version_number)
            load_times_ms.append((time.perf_counter() - t0) * 1000)  # ms

        # MB read divided by seconds spent (times are in ms, hence * 1000)
        throughput = size_mb * NUM_VERSIONS * 1000 / sum(load_times_ms)
        print(f'    Mean: {np.mean(load_times_ms):.2f}ms')
        print(f'    Throughput: {throughput:.1f} MB/s')

In [None]:
# Cell 16: Model Load Results Visualization
# Charts load throughput for a single representative model size. Only SynaDB
# has a load benchmark in this notebook, so it is the only bar shown.
test_size = 10  # Model size (MB) used for the comparison

load_throughput = {}

# .get() keeps the cell from raising KeyError when test_size is not one of the
# configured MODEL_SIZES_MB.
synadb_times_ms = synadb_load_times.get(test_size)
if synadb_times_ms:
    # Use the number of samples actually recorded rather than assuming
    # NUM_VERSIONS. Times are in ms, hence * 1000.
    load_throughput['SynaDB'] = test_size * len(synadb_times_ms) * 1000 / sum(synadb_times_ms)

# Load benchmarks for MLflow and DVC are not implemented yet
if load_throughput:
    fig = throughput_comparison(
        load_throughput,
        title=f'Model Load Throughput ({test_size}MB models)',
        ylabel='MB/second'
    )
    plt.show()
else:
    print('No load results to display.')

## 📋 Demo: Version Management <a id="demo-version"></a>

Let's demonstrate how each system handles model versioning.

In [None]:
# Cell 18: SynaDB Version Management Demo
# Lists the versions recorded for the 10MB model and loads one of them by number.
if not (HAS_SYNADB and synadb_registry):
    print('⚠️ SynaDB not available')
else:
    print('SynaDB Version Management')
    print('=' * 50)

    # Enumerate everything stored under this model name
    versions = synadb_registry.list_versions('model_10mb')
    print('\nModel: model_10mb')
    print(f'Total versions: {len(versions)}')
    print('\nVersion History:')
    for entry in versions[:5]:  # First five only, to keep the output short
        print(f'  v{entry.version}: {entry.stage} - checksum: {entry.checksum[:16]}...')

    # Fetch one specific version by number
    print('\n\nLoading specific version (v3)...')
    data, info = synadb_registry.load_model('model_10mb', version=3)
    loaded_mb = len(data) / (1024*1024)
    print(f'  Loaded {loaded_mb:.1f}MB')
    print(f'  Stage: {info.stage}')
    # Printed unconditionally: this cell performs no checksum comparison itself.
    # NOTE(review): relies on load_model verifying the checksum - confirm.
    print('  Checksum verified: ✓')

In [None]:
# Cell 19: Version Management Comparison Table
# Renders a hand-written Markdown table. None of its entries are computed from
# the benchmark results gathered earlier in the notebook.
from IPython.display import display, Markdown

# Static Markdown source: one row per feature, one column per system
comparison = '''
### Version Management Comparison

| Feature | SynaDB | MLflow | DVC |
|---------|--------|--------|-----|
| **Auto-versioning** | ✅ Automatic | ✅ Automatic | ⚠️ Manual commits |
| **Version numbering** | ✅ Sequential | ✅ Sequential | ⚠️ Git hashes |
| **Metadata per version** | ✅ Built-in | ✅ Built-in | ⚠️ Separate files |
| **Query versions** | ✅ Fast (indexed) | ✅ API calls | ⚠️ Git log parsing |
| **Storage efficiency** | ✅ Single file | ⚠️ Directory per run | ⚠️ Cache directory |
| **Offline support** | ✅ Full | ⚠️ Local mode | ✅ Full |
'''
display(Markdown(comparison))

## 🚀 Demo: Stage Promotion <a id="demo-stage"></a>

Model lifecycle management: Development → Staging → Production

In [None]:
# Cell 21: SynaDB Stage Promotion Demo
# Moves two versions of the 10MB model to Staging and Production, then reads
# the production version back.
if not (HAS_SYNADB and synadb_registry):
    print('⚠️ SynaDB not available')
else:
    print('SynaDB Stage Promotion Workflow')
    print('=' * 50)

    model_name = 'model_10mb'

    # Stages before any promotion (first three versions only)
    print('\nCurrent version stages:')
    versions = synadb_registry.list_versions(model_name)
    for entry in versions[:3]:
        print(f'  v{entry.version}: {entry.stage}')

    # v5 -> Staging
    print('\n\nPromoting v5 to Staging...')
    synadb_registry.set_stage(model_name, 5, 'Staging')
    print('  ✓ v5 is now in Staging')

    # v8 -> Production
    print('\nPromoting v8 to Production...')
    synadb_registry.set_stage(model_name, 8, 'Production')
    print('  ✓ v8 is now in Production')

    # Ask the registry which version is currently in production
    print('\n\nGetting production model...')
    prod = synadb_registry.get_production(model_name)
    if prod:
        print(f'  Production version: v{prod.version}')
        print(f'  Checksum: {prod.checksum[:16]}...')

    # Re-list and show only the versions that are no longer in Development
    print('\n\nUpdated version stages:')
    versions = synadb_registry.list_versions(model_name)
    promoted = [entry for entry in versions if entry.stage != 'Development']
    for entry in promoted:
        print(f'  v{entry.version}: {entry.stage}')

In [None]:
# Cell 22: Stage Promotion Comparison
# Renders a hand-written Markdown table and stage glossary. None of it is
# computed from the demo output above.
# NOTE(review): recent MLflow releases deprecate model stages in favor of
# aliases - confirm the MLflow column is still current.
from IPython.display import display, Markdown

# Static Markdown source: feature table followed by the list of SynaDB stages
stage_comparison = '''
### Stage Promotion Comparison

| Feature | SynaDB | MLflow | DVC |
|---------|--------|--------|-----|
| **Built-in stages** | ✅ Dev/Staging/Prod/Archived | ✅ Similar | ❌ None |
| **Promotion API** | ✅ `set_stage()` | ✅ `transition_model_version_stage()` | ❌ Manual |
| **Get production** | ✅ `get_production()` | ✅ Filter by stage | ❌ Manual |
| **Stage history** | ✅ Tracked | ✅ Tracked | ❌ Git history |
| **Approval workflow** | ⚠️ Manual | ✅ Built-in | ❌ None |

**SynaDB Stages:**
- `Development` - Initial stage for new models
- `Staging` - Testing/validation stage
- `Production` - Live deployment stage
- `Archived` - Deprecated models
'''
display(Markdown(stage_comparison))

## ⏪ Demo: Rollback <a id="demo-rollback"></a>

Demonstrating how to roll back to a previous model version.

In [None]:
# Cell 24: SynaDB Rollback Demo
# A rollback here is two stage changes: archive whatever is in Production,
# then mark v5 as Production, and finally load it back to confirm.
if not (HAS_SYNADB and synadb_registry):
    print('⚠️ SynaDB not available')
else:
    print('SynaDB Rollback Workflow')
    print('=' * 50)

    model_name = 'model_10mb'

    # What is live right now?
    print('\nCurrent production model:')
    prod = synadb_registry.get_production(model_name)
    if prod:
        print(f'  Version: v{prod.version}')

    print('\n\nRolling back to v5...')

    # Step 1: retire the current production version, if there is one
    if prod:
        synadb_registry.set_stage(model_name, prod.version, 'Archived')
        print(f'  ✓ v{prod.version} archived')

    # Step 2: put v5 into Production
    synadb_registry.set_stage(model_name, 5, 'Production')
    print('  ✓ v5 promoted to Production')

    # Confirm by asking the registry again and loading that version
    print('\n\nVerifying rollback...')
    new_prod = synadb_registry.get_production(model_name)
    if new_prod:
        print(f'  New production version: v{new_prod.version}')

        data, info = synadb_registry.load_model(model_name, version=new_prod.version)
        # Printed unconditionally: no checksum comparison is done in this cell.
        # NOTE(review): relies on load_model verifying the checksum - confirm.
        print('  Integrity verified: ✓ (SHA-256 checksum)')
        loaded_mb = len(data) / (1024*1024)
        print(f'  Model size: {loaded_mb:.1f}MB')

In [None]:
# Cell 25: Rollback Comparison
# Renders a hand-written Markdown table. None of its entries are computed from
# the rollback demo above.
from IPython.display import display, Markdown

# Static Markdown source: one row per feature, one column per system
rollback_comparison = '''
### Rollback Comparison

| Feature | SynaDB | MLflow | DVC |
|---------|--------|--------|-----|
| **Rollback method** | Stage change | Stage change | `dvc checkout` |
| **Speed** | ✅ Instant | ✅ Instant | ⚠️ File copy |
| **Integrity check** | ✅ SHA-256 | ⚠️ Optional | ✅ MD5 |
| **Audit trail** | ✅ Tracked | ✅ Tracked | ✅ Git history |
| **Atomic operation** | ✅ Yes | ✅ Yes | ⚠️ Multi-step |
'''
display(Markdown(rollback_comparison))

## 🔒 Integrity Guarantees <a id="integrity"></a>

Comparing data integrity features across systems.

In [None]:
# Cell 27: SynaDB Integrity Demo
# Loads v1 of the 10MB model, prints its stored checksum, and recomputes a
# SHA-256 digest of the loaded bytes to compare against it.
if not (HAS_SYNADB and synadb_registry):
    print('⚠️ SynaDB not available')
else:
    print('SynaDB Integrity Guarantees')
    print('=' * 50)

    model_name = 'model_10mb'

    print('\nLoading model with integrity verification...')
    data, info = synadb_registry.load_model(model_name, version=1)

    loaded_mb = len(data) / (1024*1024)
    print(f'\n  Model: {model_name}')
    print(f'  Version: v{info.version}')
    print(f'  Size: {loaded_mb:.1f}MB')
    print(f'  Checksum (SHA-256): {info.checksum}')
    print('  Verification: ✓ Automatic on load')

    # Independent check: hash the loaded bytes and compare with the stored value.
    # NOTE(review): assumes info.checksum is a lowercase hex SHA-256 digest of
    # the raw payload - confirm against the SynaDB API.
    print('\n\nManual checksum verification...')
    computed_hash = hashlib.sha256(data).hexdigest()
    matches = computed_hash == info.checksum
    verdict = '✓' if matches else '✗'
    print(f'  Computed: {computed_hash[:32]}...')
    print(f'  Stored:   {info.checksum[:32]}...')
    print(f'  Match: {verdict}')

In [None]:
# Cell 28: Integrity Comparison Table
# Renders a hand-written Markdown table and rationale. None of it is computed
# from the integrity demo above.
from IPython.display import display, Markdown

# Static Markdown source: feature table followed by the case for SHA-256
integrity_comparison = '''
### Integrity Guarantee Comparison

| Feature | SynaDB | MLflow | DVC |
|---------|--------|--------|-----|
| **Checksum algorithm** | SHA-256 | None (optional) | MD5 |
| **Auto-verification** | ✅ On every load | ❌ Manual | ✅ On checkout |
| **Corruption detection** | ✅ Immediate | ❌ None | ✅ On access |
| **Tamper detection** | ✅ Cryptographic | ❌ None | ⚠️ MD5 (weak) |
| **Checksum storage** | ✅ With model | ❌ Separate | ✅ .dvc files |

**Why SHA-256?**
- Cryptographically secure (unlike MD5)
- Detects both accidental corruption and tampering
- Industry standard for data integrity
- Fast enough for large models
'''
display(Markdown(integrity_comparison))

## 📊 Results Summary <a id="results"></a>

Let's summarize all benchmark results and comparisons.

In [None]:
# Cell 30: Results Summary Table
# Builds a Markdown table with one measured row (save throughput at a single
# model size) followed by hand-written feature rows.
from IPython.display import display, Markdown

test_size = 10  # Model size (MB) reported in the measured row


def _format_save_throughput(times_by_size, size_mb):
    """Return save throughput for one system as 'X.Y MB/s', or 'N/A'.

    Args:
        times_by_size: dict mapping model size (MB) to a list of per-save
            latencies in milliseconds.
        size_mb: model size (MB) to report.

    Returns:
        A formatted throughput string, or 'N/A' when no timings were recorded
        for that size or their total is zero.
    """
    times_ms = times_by_size.get(size_mb)
    total_ms = sum(times_ms) if times_ms else 0
    if not total_ms:
        return 'N/A'
    # Use the number of samples actually recorded rather than assuming
    # NUM_VERSIONS. Times are in ms, hence * 1000.
    return f'{size_mb * len(times_ms) * 1000 / total_ms:.1f} MB/s'


synadb_str = _format_save_throughput(synadb_save_times, test_size)
mlflow_str = _format_save_throughput(mlflow_save_times, test_size)
dvc_str = _format_save_throughput(dvc_save_times, test_size)

# Table header
summary_lines = ['### Performance Summary\n']
summary_lines.append('| Metric | SynaDB | MLflow | DVC |')
summary_lines.append('|--------|--------|--------|-----|')

# Measured row. The label is derived from test_size so the two cannot drift
# apart. The SynaDB cell is always bold, whichever system is fastest.
summary_lines.append(f'| Save throughput ({test_size}MB) | **{synadb_str}** | {mlflow_str} | {dvc_str} |')

# Hand-written feature rows (not derived from measurements)
summary_lines.append('| Checksum verification | ✅ SHA-256 | ❌ None | ✅ MD5 |')
summary_lines.append('| Stage management | ✅ Built-in | ✅ Built-in | ❌ None |')
summary_lines.append('| Single file storage | ✅ Yes | ❌ Directory | ❌ Cache |')
summary_lines.append('| Offline support | ✅ Full | ⚠️ Local mode | ✅ Full |')

display(Markdown('\n'.join(summary_lines)))

In [None]:
# Cell 31: Feature Comparison Chart
# Grouped bar chart of hand-assigned feature scores
# (0 = none, 0.5 = partial, 1 = full). The scores are not measured.
features = ['Embedded', 'Checksums', 'Stages', 'Offline', 'Single File']
synadb_scores = [1, 1, 1, 1, 1]    # Full marks on every feature
mlflow_scores = [0, 0, 1, 0.5, 0]  # Server-based, no checksums, has stages
dvc_scores = [1, 0.5, 0, 1, 0]     # Embedded, MD5, no stages

x = np.arange(len(features))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# One bar group per system, shifted left / centered / right around each tick
bar_groups = (
    (-width, synadb_scores, 'SynaDB', 'synadb'),
    (0, mlflow_scores, 'MLflow', 'competitor'),
    (width, dvc_scores, 'DVC', 'competitor_alt'),
)
for offset, scores, label, color_key in bar_groups:
    ax.bar(x + offset, scores, width, label=label, color=COLORS[color_key])

ax.set(
    ylabel='Feature Support',
    title='Model Registry Feature Comparison',
    ylim=(0, 1.2),
)
ax.set_xticks(x)
ax.set_xticklabels(features)
ax.legend()

# Replace the numeric y ticks with the three support levels
ax.set_yticks([0, 0.5, 1])
ax.set_yticklabels(['None', 'Partial', 'Full'])

plt.tight_layout()
plt.show()

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Cell 33: Conclusions
# Renders the closing summary through the shared notebook helper. The points
# are hand-written takeaways, not values derived from the measurements above.
conclusion_box(
    title='Key Takeaways',
    points=[
        'SynaDB provides embedded model registry with SHA-256 integrity verification',
        'Single-file storage simplifies deployment and backup',
        'Built-in stage management (Dev → Staging → Production → Archived)',
        'Offline-first design - no server or network required',
        'MLflow offers richer UI but requires server infrastructure',
        'DVC integrates with Git but lacks built-in stage management'
    ],
    summary='SynaDB is ideal for embedded ML applications needing simple, reliable model versioning.'
)

In [None]:
# Cell 34: Cleanup
# Best-effort removal of the scratch directory created for the benchmarks.
# A failure is reported rather than raised so the notebook still finishes.
# NOTE(review): the SynaDB registry object is still referenced at this point;
# on platforms that lock open files the removal may fail - confirm.
import shutil
try:
    shutil.rmtree(temp_dir)
except Exception as cleanup_error:
    print(f'⚠️ Could not clean up: {cleanup_error}')
else:
    print(f'✓ Cleaned up temp directory: {temp_dir}')

---

**Next Steps:**
- Try the [Hugging Face Hub comparison](11_huggingface_hub.ipynb) for transformer model storage
- Explore [LLM Framework integrations](../llm_frameworks/) for RAG applications
- Check out [End-to-End Pipeline](../specialized/18_end_to_end_pipeline.ipynb) for complete ML workflows