In [None]:
# Cell 1: Header and Setup
import sys

# Put the parent directory at the front of the import path so the `utils`
# imports below can be resolved from there.
sys.path.insert(0, '..')

from utils.notebook_utils import (
    display_header,
    display_toc,
    check_dependency,
    conclusion_box,
    info_box,
    warning_box,
)
from utils.system_info import display_system_info
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, throughput_comparison, COLORS

# Notebook banner: title first, then the systems under comparison.
display_header(
    'Experiment Tracking Comparison',
    'SynaDB vs Neptune vs ClearML',
)

In [None]:
# Cell 2: Table of Contents
# Section title -> anchor id. Each anchor matches an `<a id="...">` tag placed
# on the corresponding markdown heading further down the notebook.
anchor_by_title = {
    'Introduction': 'introduction',
    'Setup': 'setup',
    'API Comparison': 'api-comparison',
    'Hardware Tracking': 'hardware-tracking',
    'Dataset Versioning': 'dataset-versioning',
    'Feature Comparison Matrix': 'feature-matrix',
    'Self-Hosting Comparison': 'self-hosting',
    'Results Summary': 'results',
    'Conclusions': 'conclusions',
}

# display_toc receives an ordered list of (title, anchor) tuples.
sections = list(anchor_by_title.items())
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB's ExperimentTracker** against **Neptune** and **ClearML**, two popular MLOps platforms.

| System | Type | Key Features |
|--------|------|-------------|
| **SynaDB** | Embedded | Single-file, zero config, offline-first, free |
| **Neptune** | Cloud/Self-hosted | Rich metadata, collaboration, integrations |
| **ClearML** | Cloud/Self-hosted | Open-source, pipelines, data management |

### What We'll Compare

- **API design** and ease of use
- **Hardware tracking** capabilities
- **Dataset versioning** patterns
- **Feature comparison** matrix
- **Self-hosting** options

### Important Note

Neptune and ClearML require API keys and/or server setup. This notebook therefore demonstrates patterns and compares approaches: the SynaDB examples run locally, while the Neptune and ClearML snippets are shown for reference and are not executed.

In [None]:
# Cell 4: System Info
# `display_system_info` is imported from utils.system_info in Cell 1.
# Presumably it shows details of the host machine so results can be read in
# context - confirm against the helper's implementation.
display_system_info()

## 🔧 Setup <a id="setup"></a>

Let's set up our test environment for experiment tracking comparison.

In [None]:
# Cell 6: Check Dependencies and Imports
import json
import os
import shutil
import tempfile
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# Probe the three tracking backends in a fixed order (SynaDB, Neptune, ClearML).
# Each probe receives the package name and the matching install hint; the
# results are kept as HAS_* flags that later cells branch on.
HAS_SYNADB, HAS_NEPTUNE, HAS_CLEARML = (
    check_dependency(package, f'pip install {package}')
    for package in ('synadb', 'neptune', 'clearml')
)

# Apply consistent styling
setup_style()

In [None]:
# Cell 7: Configuration
# Tunable knobs for the comparison, gathered in one place.
NUM_EPOCHS = 50         # Epochs per run
NUM_METRICS = 5         # Metrics per epoch
SEED = 42               # For reproducibility

# Seed NumPy's global RNG up front so the random sample data is repeatable.
np.random.seed(SEED)

# Echo the configuration so it is visible in the notebook output.
config_report = [
    'Test Configuration:',
    f'  Epochs: {NUM_EPOCHS}',
    f'  Metrics per epoch: {NUM_METRICS}',
]
print('\n'.join(config_report))

# Scratch directory for the files this notebook writes; removed by the cleanup cell.
temp_dir = tempfile.mkdtemp(prefix='synadb_neptune_clearml_')
print(f'\nUsing temp directory: {temp_dir}')

# The SynaDB experiment database lives inside the scratch directory.
synadb_path = os.path.join(temp_dir, 'synadb_experiments.db')

## 🔌 API Comparison <a id="api-comparison"></a>

Let's compare the API design and ease of use across all three platforms.

In [None]:
# Cell 9: API Comparison Demonstration
# NOTE: `display` and `Markdown` imported here are also used by the later
# comparison cells (hardware, dataset, feature matrix, ...), so this cell has
# to run before them.
from IPython.display import display, Markdown, HTML

# Markdown source for the side-by-side API walkthrough.
# The fenced snippets are illustrative text only: nothing inside this string is
# executed, and names such as loss_value / acc_value / model_bytes are
# placeholders. "YOUR_API_TOKEN" in the Neptune snippet is a placeholder too,
# not a real credential.
api_comparison = '''
### API Design Comparison

#### SynaDB - Simple and Direct

```python
from synadb import ExperimentTracker

# Initialize - just a file path!
tracker = ExperimentTracker("experiments.db")

# Start run
run_id = tracker.start_run("my_experiment", tags=["baseline"])

# Log parameters
tracker.log_param(run_id, "learning_rate", "0.001")
tracker.log_param(run_id, "batch_size", "32")

# Log metrics
for epoch in range(100):
    tracker.log_metric(run_id, "loss", loss_value, step=epoch)
    tracker.log_metric(run_id, "accuracy", acc_value, step=epoch)

# Log artifact
tracker.log_artifact(run_id, "model.pt", model_bytes)

# End run
tracker.end_run(run_id, "Completed")
```

#### Neptune - Namespace-based

```python
import neptune

# Initialize - requires API token
run = neptune.init_run(
    project="workspace/project",
    api_token="YOUR_API_TOKEN"
)

# Log parameters (namespace style)
run["parameters/learning_rate"] = 0.001
run["parameters/batch_size"] = 32

# Log metrics (append style)
for epoch in range(100):
    run["metrics/loss"].append(loss_value)
    run["metrics/accuracy"].append(acc_value)

# Log artifact
run["artifacts/model"].upload("model.pt")

# Stop run
run.stop()
```

#### ClearML - Auto-logging

```python
from clearml import Task

# Initialize - auto-detects many frameworks
task = Task.init(
    project_name="my_project",
    task_name="my_experiment"
)

# Log parameters (dict style)
task.connect({"learning_rate": 0.001, "batch_size": 32})

# Log metrics (via Logger)
logger = task.get_logger()
for epoch in range(100):
    logger.report_scalar("loss", "train", loss_value, epoch)
    logger.report_scalar("accuracy", "train", acc_value, epoch)

# Log artifact
task.upload_artifact("model", "model.pt")

# Close task
task.close()
```
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(api_comparison))

In [None]:
# Cell 10: SynaDB API Demonstration
if not HAS_SYNADB:
    warning_box('SynaDB not available - skipping demo')
else:
    from synadb import ExperimentTracker

    # One tracker for the notebook, backed by the database path from the config cell.
    # Later SynaDB cells reuse this `tracker` object.
    tracker = ExperimentTracker(synadb_path)

    # Open a run under the 'api_demo' experiment.
    run_id = tracker.start_run('api_demo', tags=['demo', 'comparison'])
    print(f'Started run: {run_id}')

    # Hyperparameters are passed as strings, one log_param call per entry.
    demo_params = (
        ('learning_rate', '0.001'),
        ('batch_size', '32'),
        ('optimizer', 'adam'),
    )
    for param_name, param_value in demo_params:
        tracker.log_param(run_id, param_name, param_value)
    print('Logged parameters')

    # Synthetic training curves: loss decays as 1/(epoch+1), accuracy climbs
    # linearly from 0.5 in steps of 0.05.
    for epoch in range(10):
        loss = 1.0 / (epoch + 1)
        acc = 0.5 + 0.05 * epoch
        tracker.log_metric(run_id, 'loss', loss, step=epoch)
        tracker.log_metric(run_id, 'accuracy', acc, step=epoch)
    print('Logged metrics for 10 epochs')

    # Stand-in payload playing the role of serialized model weights.
    model_bytes = b'fake_model_weights' * 100
    tracker.log_artifact(run_id, 'model.bin', model_bytes)
    print('Logged artifact')

    # Close the run with its final status.
    tracker.end_run(run_id, 'Completed')
    print('Run completed!')

    # Read the run back and show what was stored for it.
    run = tracker.get_run(run_id)
    print('\nRun details:')
    print(f'  Experiment: {run.experiment}')
    print(f'  Status: {run.status}')
    print(f'  Tags: {run.tags}')

## 🖥️ Hardware Tracking <a id="hardware-tracking"></a>

Hardware tracking is essential for reproducibility. Let's compare how each platform handles this.

In [None]:
# Cell 12: Hardware Tracking Comparison
# Markdown source for the hardware-tracking table and example. The embedded
# snippet is display text only and is never executed by this notebook.
# Relies on `display` and `Markdown` imported in Cell 9.
# NOTE(review): the snippet formats sizes with `// 1e9`, whereas the runnable
# demo in Cell 13 uses `/ 1e9:.1f` - the logged strings would differ.
hardware_comparison = '''
### Hardware Tracking Capabilities

| Feature | SynaDB | Neptune | ClearML |
|---------|--------|---------|--------|
| **CPU Info** | Manual | Auto | Auto |
| **GPU Info** | Manual | Auto | Auto |
| **Memory Usage** | Manual | Auto | Auto |
| **GPU Memory** | Manual | Auto | Auto |
| **Real-time Monitoring** | ❌ | ✅ | ✅ |
| **Custom Metrics** | ✅ | ✅ | ✅ |
| **Offline Support** | ✅ | Limited | Limited |

#### SynaDB Approach

SynaDB focuses on simplicity - you log what you need:

```python
import psutil
import torch

# Log system info as parameters
tracker.log_param(run_id, "cpu_count", str(psutil.cpu_count()))
tracker.log_param(run_id, "ram_gb", str(psutil.virtual_memory().total // 1e9))

if torch.cuda.is_available():
    tracker.log_param(run_id, "gpu_name", torch.cuda.get_device_name(0))
    tracker.log_param(run_id, "gpu_memory_gb", str(torch.cuda.get_device_properties(0).total_memory // 1e9))
```

#### Neptune/ClearML Approach

Both platforms auto-capture hardware info, but require network connectivity.
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(hardware_comparison))

In [None]:
# Cell 13: Hardware Tracking Demo with SynaDB
# Logs a snapshot of the host (Python, OS, CPU, RAM, GPU) as string parameters
# on a dedicated run. Reuses the `tracker` opened in Cell 10, which is created
# under the same HAS_SYNADB guard.
import platform

if HAS_SYNADB:
    # Start a new run for hardware tracking demo
    hw_run_id = tracker.start_run('hardware_demo', tags=['hardware'])

    # Log system information
    tracker.log_param(hw_run_id, 'python_version', platform.python_version())
    tracker.log_param(hw_run_id, 'platform', platform.platform())
    # platform.processor() returns '' when the value cannot be determined
    # (common on Linux), so fall back to the machine type and finally to
    # 'unknown' rather than logging an empty parameter.
    processor_name = platform.processor() or platform.machine() or 'unknown'
    tracker.log_param(hw_run_id, 'processor', processor_name)

    # Try to get more detailed info (psutil is optional)
    try:
        import psutil
        tracker.log_param(hw_run_id, 'cpu_count', str(psutil.cpu_count()))
        tracker.log_param(hw_run_id, 'ram_total_gb', f'{psutil.virtual_memory().total / 1e9:.1f}')
        print('Logged CPU and RAM info via psutil')
    except ImportError:
        print('psutil not available - skipping detailed CPU/RAM info')

    # Try to get GPU info (PyTorch is optional)
    try:
        import torch
        if torch.cuda.is_available():
            tracker.log_param(hw_run_id, 'gpu_name', torch.cuda.get_device_name(0))
            tracker.log_param(hw_run_id, 'gpu_count', str(torch.cuda.device_count()))
            print('Logged GPU info via PyTorch')
        else:
            # Record the absence explicitly so the run states that no GPU was used.
            tracker.log_param(hw_run_id, 'gpu_available', 'false')
            print('No GPU available')
    except ImportError:
        print('PyTorch not available - skipping GPU info')

    tracker.end_run(hw_run_id, 'Completed')
    print('\nHardware tracking demo completed!')
else:
    warning_box('SynaDB not available')

## 📦 Dataset Versioning <a id="dataset-versioning"></a>

Dataset versioning is crucial for ML reproducibility. Let's compare approaches.

In [None]:
# Cell 15: Dataset Versioning Comparison
# Markdown source for the dataset-versioning table and per-platform patterns.
# The fenced snippets are display text only; none of them is executed here.
# Relies on `display` and `Markdown` imported in Cell 9.
dataset_comparison = '''
### Dataset Versioning Approaches

| Feature | SynaDB | Neptune | ClearML |
|---------|--------|---------|--------|
| **Built-in Versioning** | Via artifacts | ✅ | ✅ |
| **Large File Support** | ✅ (chunked) | ✅ | ✅ |
| **Deduplication** | ❌ | ✅ | ✅ |
| **Remote Storage** | ❌ | ✅ | ✅ |
| **Local-first** | ✅ | ❌ | ❌ |
| **Zero Config** | ✅ | ❌ | ❌ |

#### SynaDB Pattern

```python
# Store dataset as artifact with version in name
tracker.log_artifact(run_id, "dataset_v1.npz", dataset_bytes)

# Or use ModelRegistry for versioned storage
from synadb import ModelRegistry
registry = ModelRegistry("datasets.db")
registry.save_model("mnist", dataset_bytes, {"samples": "60000"})
```

#### Neptune Pattern

```python
run["datasets/train"].track_files("data/train/")
run["datasets/train"].upload_files("data/train/")
```

#### ClearML Pattern

```python
from clearml import Dataset
dataset = Dataset.create(dataset_name="mnist", dataset_project="datasets")
dataset.add_files("data/train/")
dataset.upload()
dataset.finalize()
```
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(dataset_comparison))

In [None]:
# Cell 16: Dataset Versioning Demo with SynaDB
if not HAS_SYNADB:
    warning_box('SynaDB not available')
else:
    from synadb import ModelRegistry

    # A second SynaDB file in the scratch directory serves as the dataset store.
    dataset_registry_path = os.path.join(temp_dir, 'datasets.db')
    dataset_registry = ModelRegistry(dataset_registry_path)

    # First version: 1000 rows x 10 float32 features, flattened to raw bytes.
    sample_data = np.random.randn(1000, 10).astype(np.float32)
    dataset_bytes = sample_data.tobytes()
    v1_metadata = {'samples': '1000', 'features': '10', 'dtype': 'float32'}
    v1 = dataset_registry.save_model('sample_dataset', dataset_bytes, v1_metadata)
    print(f'Saved dataset v{v1.version}')

    # Second version, saved under the same name with twice as many rows.
    sample_data_v2 = np.random.randn(2000, 10).astype(np.float32)
    dataset_bytes_v2 = sample_data_v2.tobytes()
    v2_metadata = {'samples': '2000', 'features': '10', 'dtype': 'float32'}
    v2 = dataset_registry.save_model('sample_dataset', dataset_bytes_v2, v2_metadata)
    print(f'Saved dataset v{v2.version}')

    # Enumerate everything stored under the dataset name.
    versions = dataset_registry.list_versions('sample_dataset')
    print('\nDataset versions:')
    for stored in versions:
        print(f'  v{stored.version}: {stored.metadata}')

    # Fetch version 1 back. The message reports checksum verification, which is
    # assumed to happen inside load_model - it is not checked in this cell.
    data, info = dataset_registry.load_model('sample_dataset', version=1)
    print(f'\nLoaded v1: {len(data)} bytes, checksum verified!')

## 📊 Feature Comparison Matrix <a id="feature-matrix"></a>

A comprehensive comparison of features across all three platforms.

In [None]:
# Cell 18: Feature Comparison Matrix
# Markdown table comparing the three platforms feature by feature. An empty
# first column continues the category named on the closest row above it.
# Relies on `display` and `Markdown` imported in Cell 9.
feature_matrix = '''
### Complete Feature Matrix

| Category | Feature | SynaDB | Neptune | ClearML |
|----------|---------|--------|---------|--------|
| **Setup** | Zero config | ✅ | ❌ | ❌ |
| | Single file | ✅ | ❌ | ❌ |
| | No account required | ✅ | ❌ | ❌ |
| | Offline-first | ✅ | ❌ | ❌ |
| **Tracking** | Parameters | ✅ | ✅ | ✅ |
| | Metrics | ✅ | ✅ | ✅ |
| | Artifacts | ✅ | ✅ | ✅ |
| | Source code | ❌ | ✅ | ✅ |
| | Git integration | ❌ | ✅ | ✅ |
| | Auto-logging | ❌ | ✅ | ✅ |
| **Visualization** | Built-in UI | ❌ | ✅ | ✅ |
| | Custom charts | Via export | ✅ | ✅ |
| | Comparison views | Via export | ✅ | ✅ |
| **Collaboration** | Team sharing | ❌ | ✅ | ✅ |
| | Comments | ❌ | ✅ | ✅ |
| | Access control | ❌ | ✅ | ✅ |
| **MLOps** | Pipelines | ❌ | ❌ | ✅ |
| | Model serving | ❌ | ❌ | ✅ |
| | Data management | ❌ | ✅ | ✅ |
| **Cost** | Free tier | ✅ Unlimited | Limited | Limited |
| | Self-hosted | N/A | ✅ | ✅ |
| | Open source | ✅ | ❌ | ✅ |
| **Performance** | Local speed | ✅ Fast | Network | Network |
| | Large artifacts | ✅ | ✅ | ✅ |
| | Query speed | ✅ Fast | Depends | Depends |
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(feature_matrix))

In [None]:
# Cell 19: When to Use Each Platform
# Markdown guidance listing the situations that favour each platform.
# Relies on `display` and `Markdown` imported in Cell 9.
when_to_use = '''
### When to Use Each Platform

#### Choose SynaDB When:
- 🏠 **Local development** - No server setup needed
- 🔒 **Privacy matters** - Data never leaves your machine
- ✈️ **Offline work** - Works without internet
- 💰 **Budget constraints** - Completely free
- 🚀 **Quick prototyping** - Zero config, instant start
- 📦 **Embedded use** - Part of your application

#### Choose Neptune When:
- 👥 **Team collaboration** - Share experiments easily
- 📊 **Rich visualization** - Built-in dashboards
- 🔗 **Integrations** - Many framework integrations
- 📝 **Metadata-heavy** - Complex experiment organization

#### Choose ClearML When:
- 🔄 **Full MLOps** - Pipelines, serving, data management
- 🏢 **Enterprise** - Self-hosted with full control
- 🤖 **Auto-logging** - Minimal code changes
- 📦 **Data versioning** - Built-in dataset management
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(when_to_use))

## 🏠 Self-Hosting Comparison <a id="self-hosting"></a>

For organizations that need to keep data on-premises, self-hosting options matter.

In [None]:
# Cell 21: Self-Hosting Comparison
# Markdown source for the self-hosting table and deployment snippets. The
# python/bash fences are display text only and are never executed here.
# Relies on `display` and `Markdown` imported in Cell 9.
self_hosting = '''
### Self-Hosting Options

| Aspect | SynaDB | Neptune | ClearML |
|--------|--------|---------|--------|
| **Deployment** | N/A (embedded) | Enterprise only | Open source |
| **Infrastructure** | None | Kubernetes | Docker/K8s |
| **Database** | Single file | PostgreSQL | MongoDB + Redis |
| **Storage** | Local disk | S3/GCS/Azure | S3/GCS/Azure |
| **Complexity** | ⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐ |
| **Cost** | Free | Enterprise license | Free (open source) |

#### SynaDB: No Server Needed

```python
# SynaDB is embedded - no server to deploy!
from synadb import ExperimentTracker
tracker = ExperimentTracker("/shared/nfs/experiments.db")
# That\'s it - works on any shared filesystem
```

#### ClearML Self-Hosted

```bash
# Requires Docker Compose with multiple services
docker-compose -f docker-compose.yml up -d
# Services: webserver, apiserver, fileserver, mongodb, redis, elasticsearch
```

#### Neptune Self-Hosted

```bash
# Enterprise-only, requires Kubernetes
helm install neptune neptune/neptune-server
# Plus: PostgreSQL, object storage, load balancer
```
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(self_hosting))

## 📈 Results Summary <a id="results"></a>

Let's summarize our findings from this comparison.

In [None]:
# Cell 23: Results Summary
# Hand-assigned 1-5 ratings per platform; each list is ordered like `categories`.
categories = ['Setup\nSimplicity', 'Offline\nSupport', 'Collaboration', 'MLOps\nFeatures', 'Cost\nEfficiency']
synadb_scores = [5, 5, 1, 2, 5]
neptune_scores = [2, 1, 5, 3, 3]
clearml_scores = [2, 1, 5, 5, 4]

x = np.arange(len(categories))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))

# Grouped bars: each platform gets one slot left of, on, or right of the tick.
# Tuple layout: (bar positions, scores, legend label, key into COLORS).
platform_series = [
    (x - width, synadb_scores, 'SynaDB', 'synadb'),
    (x, neptune_scores, 'Neptune', 'competitor1'),
    (x + width, clearml_scores, 'ClearML', 'competitor2'),
]
for positions, scores, label, color_key in platform_series:
    ax.bar(positions, scores, width, label=label, color=COLORS[color_key])

# Titles and axis labelling.
ax.set_title('Platform Comparison by Category')
ax.set_ylabel('Score (1-5)')
ax.set_xticks(x)
ax.set_xticklabels(categories)

# Headroom above the top score of 5, a light horizontal grid, and the legend.
ax.set_ylim(0, 6)
ax.grid(axis='y', alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

print('\nScoring: 5=Excellent, 4=Good, 3=Average, 2=Limited, 1=Poor')

In [None]:
# Cell 24: Key Findings
# Markdown summary naming a winner per criterion, plus performance notes.
# These are qualitative statements; no cell in this notebook measures them.
# Relies on `display` and `Markdown` imported in Cell 9.
findings = '''
### Key Findings

| Metric | Winner | Notes |
|--------|--------|-------|
| **Setup Time** | SynaDB | Zero config vs server setup |
| **Offline Support** | SynaDB | Works without internet |
| **Team Collaboration** | Neptune/ClearML | Built-in sharing features |
| **MLOps Features** | ClearML | Full pipeline support |
| **Cost** | SynaDB | Completely free |
| **Privacy** | SynaDB | Data stays local |
| **Auto-logging** | ClearML | Best framework integration |
| **Visualization** | Neptune | Rich built-in dashboards |

### Performance Notes

- **SynaDB**: Local disk speed, no network latency
- **Neptune**: Network-dependent, optimized for cloud
- **ClearML**: Network-dependent, can be self-hosted
'''

# Render the string as formatted Markdown in the cell output.
display(Markdown(findings))

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Cell 26: Conclusions
# Hands the closing summary (Markdown-formatted text) to `conclusion_box`,
# which is imported from utils.notebook_utils in Cell 1.
# Presumably the helper renders it as a highlighted box - confirm in the helper.
conclusion_box('''
### SynaDB vs Neptune vs ClearML

**SynaDB excels at:**
- Zero-config local experiment tracking
- Offline-first development workflows
- Privacy-sensitive environments
- Embedded use cases
- Cost-conscious projects

**Neptune excels at:**
- Team collaboration and sharing
- Rich visualization and dashboards
- Metadata organization
- Enterprise integrations

**ClearML excels at:**
- Full MLOps pipelines
- Auto-logging capabilities
- Self-hosted deployments
- Data and model management

**Recommendation:**
- Use **SynaDB** for local development, prototyping, and privacy-first workflows
- Use **Neptune** when team collaboration and visualization are priorities
- Use **ClearML** when you need full MLOps capabilities

All three can coexist - use SynaDB locally, then export to Neptune/ClearML for team sharing!
''')

In [None]:
# Cell 27: Cleanup
# Removes the scratch directory created in the configuration cell.
#
# The SynaDB objects opened earlier (`tracker`, `dataset_registry`) may still
# hold their database files open inside that directory. On platforms that lock
# open files (e.g. Windows) this can make the removal fail and leave the
# directory behind, so release the handles first.
# close() is looked up dynamically because this notebook does not show whether
# these objects expose one; objects without it, and names that were never
# created (SynaDB not installed), are skipped.
for handle_name in ('tracker', 'dataset_registry'):
    close_handle = getattr(globals().get(handle_name), 'close', None)
    if callable(close_handle):
        try:
            close_handle()
        except Exception as e:
            # Best-effort: a failed close must not prevent the directory removal.
            print(f'Cleanup warning: could not close {handle_name}: {e}')

print('Cleaning up temporary files...')
try:
    shutil.rmtree(temp_dir)
    print(f'Removed: {temp_dir}')
except Exception as e:
    # Deliberately non-fatal: a leftover temp directory should not fail the notebook.
    print(f'Cleanup warning: {e}')

print('\n✅ Notebook completed successfully!')